{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9929627023223082,
  "eval_steps": 500,
  "global_step": 236,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00844475721323012,
      "grad_norm": 2.089867158089946,
      "learning_rate": 4.1666666666666667e-07,
      "loss": 2.161,
      "step": 1
    },
    {
      "epoch": 0.0422237860661506,
      "grad_norm": 1.5848399731293203,
      "learning_rate": 2.0833333333333334e-06,
      "loss": 2.0465,
      "step": 5
    },
    {
      "epoch": 0.0844475721323012,
      "grad_norm": 0.7578527824638689,
      "learning_rate": 4.166666666666667e-06,
      "loss": 1.5984,
      "step": 10
    },
    {
      "epoch": 0.1266713581984518,
      "grad_norm": 0.346915259777201,
      "learning_rate": 6.25e-06,
      "loss": 0.9384,
      "step": 15
    },
    {
      "epoch": 0.1688951442646024,
      "grad_norm": 0.17556784814479495,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.7181,
      "step": 20
    },
    {
      "epoch": 0.211118930330753,
      "grad_norm": 0.13630834383254564,
      "learning_rate": 9.999451015497595e-06,
      "loss": 0.6024,
      "step": 25
    },
    {
      "epoch": 0.2533427163969036,
      "grad_norm": 0.11554222320406965,
      "learning_rate": 9.980249213076085e-06,
      "loss": 0.5901,
      "step": 30
    },
    {
      "epoch": 0.2955665024630542,
      "grad_norm": 0.12278253986244328,
      "learning_rate": 9.933718620186745e-06,
      "loss": 0.5266,
      "step": 35
    },
    {
      "epoch": 0.3377902885292048,
      "grad_norm": 0.09979826087347965,
      "learning_rate": 9.860114570402055e-06,
      "loss": 0.4992,
      "step": 40
    },
    {
      "epoch": 0.3800140745953554,
      "grad_norm": 0.09403541146186689,
      "learning_rate": 9.759840961111098e-06,
      "loss": 0.4552,
      "step": 45
    },
    {
      "epoch": 0.422237860661506,
      "grad_norm": 0.11355916833002154,
      "learning_rate": 9.633448037159167e-06,
      "loss": 0.4481,
      "step": 50
    },
    {
      "epoch": 0.4644616467276566,
      "grad_norm": 0.08073377113874591,
      "learning_rate": 9.481629371415315e-06,
      "loss": 0.4421,
      "step": 55
    },
    {
      "epoch": 0.5066854327938072,
      "grad_norm": 0.09267587470901371,
      "learning_rate": 9.305218058836778e-06,
      "loss": 0.4315,
      "step": 60
    },
    {
      "epoch": 0.5489092188599578,
      "grad_norm": 0.07970275638417536,
      "learning_rate": 9.10518214491513e-06,
      "loss": 0.4193,
      "step": 65
    },
    {
      "epoch": 0.5911330049261084,
      "grad_norm": 0.09176650758741302,
      "learning_rate": 8.882619313590212e-06,
      "loss": 0.4359,
      "step": 70
    },
    {
      "epoch": 0.633356790992259,
      "grad_norm": 0.06297292993950475,
      "learning_rate": 8.638750863781614e-06,
      "loss": 0.3866,
      "step": 75
    },
    {
      "epoch": 0.6755805770584096,
      "grad_norm": 0.09109073826112205,
      "learning_rate": 8.374915007591053e-06,
      "loss": 0.4041,
      "step": 80
    },
    {
      "epoch": 0.7178043631245602,
      "grad_norm": 0.05865190056091487,
      "learning_rate": 8.092559526951374e-06,
      "loss": 0.4239,
      "step": 85
    },
    {
      "epoch": 0.7600281491907108,
      "grad_norm": 0.06974442479960637,
      "learning_rate": 7.793233829018263e-06,
      "loss": 0.3986,
      "step": 90
    },
    {
      "epoch": 0.8022519352568613,
      "grad_norm": 0.05463034337178698,
      "learning_rate": 7.478580443900247e-06,
      "loss": 0.4247,
      "step": 95
    },
    {
      "epoch": 0.844475721323012,
      "grad_norm": 0.06205362965112395,
      "learning_rate": 7.1503260113826035e-06,
      "loss": 0.3913,
      "step": 100
    },
    {
      "epoch": 0.8866995073891626,
      "grad_norm": 0.067892476771574,
      "learning_rate": 6.810271806104931e-06,
      "loss": 0.401,
      "step": 105
    },
    {
      "epoch": 0.9289232934553132,
      "grad_norm": 0.05154900712833624,
      "learning_rate": 6.46028385318488e-06,
      "loss": 0.3938,
      "step": 110
    },
    {
      "epoch": 0.9711470795214637,
      "grad_norm": 0.05021016889651189,
      "learning_rate": 6.10228268852786e-06,
      "loss": 0.3675,
      "step": 115
    },
    {
      "epoch": 0.9964813511611541,
      "eval_loss": 0.36927565932273865,
      "eval_runtime": 345.1421,
      "eval_samples_per_second": 4.117,
      "eval_steps_per_second": 1.031,
      "step": 118
    },
    {
      "epoch": 1.0133708655876144,
      "grad_norm": 0.06461727535494917,
      "learning_rate": 5.738232820012407e-06,
      "loss": 0.437,
      "step": 120
    },
    {
      "epoch": 1.0555946516537649,
      "grad_norm": 0.05489917224274474,
      "learning_rate": 5.370131947382215e-06,
      "loss": 0.3656,
      "step": 125
    },
    {
      "epoch": 1.0978184377199156,
      "grad_norm": 0.06042558978858904,
      "learning_rate": 5e-06,
      "loss": 0.4009,
      "step": 130
    },
    {
      "epoch": 1.1400422237860661,
      "grad_norm": 0.06049921311472245,
      "learning_rate": 4.629868052617786e-06,
      "loss": 0.3482,
      "step": 135
    },
    {
      "epoch": 1.1822660098522166,
      "grad_norm": 0.053628889597754824,
      "learning_rate": 4.261767179987595e-06,
      "loss": 0.3484,
      "step": 140
    },
    {
      "epoch": 1.2244897959183674,
      "grad_norm": 0.05427865015224349,
      "learning_rate": 3.897717311472141e-06,
      "loss": 0.3639,
      "step": 145
    },
    {
      "epoch": 1.266713581984518,
      "grad_norm": 0.057489395038740544,
      "learning_rate": 3.539716146815122e-06,
      "loss": 0.3835,
      "step": 150
    },
    {
      "epoch": 1.3089373680506686,
      "grad_norm": 0.05111391290442162,
      "learning_rate": 3.1897281938950693e-06,
      "loss": 0.3645,
      "step": 155
    },
    {
      "epoch": 1.3511611541168191,
      "grad_norm": 0.05075774897048314,
      "learning_rate": 2.8496739886173994e-06,
      "loss": 0.3693,
      "step": 160
    },
    {
      "epoch": 1.3933849401829699,
      "grad_norm": 0.04943651400266673,
      "learning_rate": 2.5214195560997546e-06,
      "loss": 0.3786,
      "step": 165
    },
    {
      "epoch": 1.4356087262491204,
      "grad_norm": 0.044263408717175816,
      "learning_rate": 2.2067661709817384e-06,
      "loss": 0.3802,
      "step": 170
    },
    {
      "epoch": 1.477832512315271,
      "grad_norm": 0.05037201642103367,
      "learning_rate": 1.9074404730486264e-06,
      "loss": 0.3548,
      "step": 175
    },
    {
      "epoch": 1.5200562983814216,
      "grad_norm": 0.047573069715430114,
      "learning_rate": 1.6250849924089485e-06,
      "loss": 0.3737,
      "step": 180
    },
    {
      "epoch": 1.5622800844475722,
      "grad_norm": 0.05285111504365142,
      "learning_rate": 1.3612491362183887e-06,
      "loss": 0.3602,
      "step": 185
    },
    {
      "epoch": 1.6045038705137227,
      "grad_norm": 0.05156216290861093,
      "learning_rate": 1.1173806864097885e-06,
      "loss": 0.3609,
      "step": 190
    },
    {
      "epoch": 1.6467276565798734,
      "grad_norm": 0.04756228232260825,
      "learning_rate": 8.948178550848702e-07,
      "loss": 0.3344,
      "step": 195
    },
    {
      "epoch": 1.688951442646024,
      "grad_norm": 0.05015333465574718,
      "learning_rate": 6.947819411632223e-07,
      "loss": 0.326,
      "step": 200
    },
    {
      "epoch": 1.7311752287121744,
      "grad_norm": 0.05391623766653661,
      "learning_rate": 5.183706285846873e-07,
      "loss": 0.3743,
      "step": 205
    },
    {
      "epoch": 1.7733990147783252,
      "grad_norm": 0.05381208848410942,
      "learning_rate": 3.665519628408332e-07,
      "loss": 0.3482,
      "step": 210
    },
    {
      "epoch": 1.8156228008444757,
      "grad_norm": 0.04072660534703765,
      "learning_rate": 2.401590388889025e-07,
      "loss": 0.3681,
      "step": 215
    },
    {
      "epoch": 1.8578465869106262,
      "grad_norm": 0.05078699788677098,
      "learning_rate": 1.3988542959794627e-07,
      "loss": 0.3522,
      "step": 220
    },
    {
      "epoch": 1.900070372976777,
      "grad_norm": 0.05235725304027692,
      "learning_rate": 6.628137981325611e-08,
      "loss": 0.3994,
      "step": 225
    },
    {
      "epoch": 1.9422941590429277,
      "grad_norm": 0.04983097587956561,
      "learning_rate": 1.975078692391552e-08,
      "loss": 0.36,
      "step": 230
    },
    {
      "epoch": 1.984517945109078,
      "grad_norm": 0.058131705794752526,
      "learning_rate": 5.489845024053698e-10,
      "loss": 0.3467,
      "step": 235
    },
    {
      "epoch": 1.9929627023223082,
      "eval_loss": 0.3553777039051056,
      "eval_runtime": 342.1435,
      "eval_samples_per_second": 4.153,
      "eval_steps_per_second": 1.04,
      "step": 236
    },
    {
      "epoch": 1.9929627023223082,
      "step": 236,
      "total_flos": 4.237289220525261e+16,
      "train_loss": 0.47937014673726036,
      "train_runtime": 10453.3911,
      "train_samples_per_second": 1.087,
      "train_steps_per_second": 0.023
    }
  ],
  "logging_steps": 5,
  "max_steps": 236,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.237289220525261e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}