| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.0, | |
| "eval_steps": 500, | |
| "global_step": 72, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.0, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5e-05, | |
| "loss": 0.0, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.500000000000001e-05, | |
| "loss": 0.0, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000125, | |
| "loss": 0.0, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015000000000000001, | |
| "loss": 0.0, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000175, | |
| "loss": 0.0, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000196875, | |
| "loss": 0.0, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019375000000000002, | |
| "loss": 0.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000190625, | |
| "loss": 0.0, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001875, | |
| "loss": 0.0, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000184375, | |
| "loss": 0.0, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018125000000000001, | |
| "loss": 0.0, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000178125, | |
| "loss": 0.0, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000175, | |
| "loss": 0.0, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017187500000000002, | |
| "loss": 0.0, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016875, | |
| "loss": 0.0, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000165625, | |
| "loss": 0.0, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016250000000000002, | |
| "loss": 0.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000159375, | |
| "loss": 0.0, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015625, | |
| "loss": 0.0, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000153125, | |
| "loss": 0.0, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015000000000000001, | |
| "loss": 0.0, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000146875, | |
| "loss": 0.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014375, | |
| "loss": 0.0, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014062500000000002, | |
| "loss": 0.0, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001375, | |
| "loss": 0.0, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000134375, | |
| "loss": 0.0, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013125000000000002, | |
| "loss": 0.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000128125, | |
| "loss": 0.0, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000125, | |
| "loss": 0.0, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00012187500000000001, | |
| "loss": 0.0, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011875, | |
| "loss": 0.0, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000115625, | |
| "loss": 0.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011250000000000001, | |
| "loss": 0.0, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000109375, | |
| "loss": 0.0, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010625000000000001, | |
| "loss": 0.0, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000103125, | |
| "loss": 0.0, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.687500000000001e-05, | |
| "loss": 0.0, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.375e-05, | |
| "loss": 0.0, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.062500000000001e-05, | |
| "loss": 0.0, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.75e-05, | |
| "loss": 0.0, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.4375e-05, | |
| "loss": 0.0, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.125000000000001e-05, | |
| "loss": 0.0, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.8125e-05, | |
| "loss": 0.0, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.500000000000001e-05, | |
| "loss": 0.0, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.1875e-05, | |
| "loss": 0.0, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.875e-05, | |
| "loss": 0.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.562500000000001e-05, | |
| "loss": 0.0, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.25e-05, | |
| "loss": 0.0, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.9375e-05, | |
| "loss": 0.0, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.6250000000000005e-05, | |
| "loss": 0.0, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.3125000000000004e-05, | |
| "loss": 0.0, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 3.11, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5e-05, | |
| "loss": 0.0, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.6875e-05, | |
| "loss": 0.0, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 3.22, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.375e-05, | |
| "loss": 0.0, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 3.28, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.0625000000000005e-05, | |
| "loss": 0.0, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.7500000000000003e-05, | |
| "loss": 0.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 3.39, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.4375e-05, | |
| "loss": 0.0, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.125e-05, | |
| "loss": 0.0, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.8125000000000003e-05, | |
| "loss": 0.0, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.0, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 3.61, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.1875e-05, | |
| "loss": 0.0, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.8750000000000002e-05, | |
| "loss": 0.0, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 3.72, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.5625e-05, | |
| "loss": 0.0, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 3.78, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.25e-05, | |
| "loss": 0.0, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.375000000000001e-06, | |
| "loss": 0.0, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 3.89, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.25e-06, | |
| "loss": 0.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.125e-06, | |
| "loss": 0.0, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.0, | |
| "step": 72 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 72, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "total_flos": 1791530237952.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |