{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "eval_steps": 500, "global_step": 40, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.48138627409935, "epoch": 0.2, "grad_norm": 348.0, "learning_rate": 5e-05, "loss": 7.8268, "mean_token_accuracy": 0.4326018691062927, "num_tokens": 642.0, "step": 1 }, { "entropy": 1.6079905033111572, "epoch": 0.4, "grad_norm": 101.0, "learning_rate": 5e-05, "loss": 3.2045, "mean_token_accuracy": 0.5241057276725769, "num_tokens": 1289.0, "step": 2 }, { "entropy": 2.244527578353882, "epoch": 0.6, "grad_norm": 46.25, "learning_rate": 5e-05, "loss": 2.0597, "mean_token_accuracy": 0.6548536419868469, "num_tokens": 1942.0, "step": 3 }, { "entropy": 1.716873288154602, "epoch": 0.8, "grad_norm": 25.625, "learning_rate": 5e-05, "loss": 1.3413, "mean_token_accuracy": 0.801232635974884, "num_tokens": 2595.0, "step": 4 }, { "entropy": 0.9718767404556274, "epoch": 1.0, "grad_norm": 15.6875, "learning_rate": 5e-05, "loss": 0.8459, "mean_token_accuracy": 0.8649068474769592, "num_tokens": 3243.0, "step": 5 }, { "epoch": 1.0, "eval_entropy": 0.5460782845815023, "eval_loss": 0.5847774744033813, "eval_mean_token_accuracy": 0.8938564459482828, "eval_num_tokens": 3243.0, "eval_runtime": 1.1326, "eval_samples_per_second": 17.658, "eval_steps_per_second": 2.649, "step": 5 }, { "entropy": 0.544474720954895, "epoch": 1.2, "grad_norm": 15.4375, "learning_rate": 5e-05, "loss": 0.5279, "mean_token_accuracy": 0.9024767875671387, "num_tokens": 3893.0, "step": 6 }, { "entropy": 0.4434409737586975, "epoch": 1.4, "grad_norm": 10.5, "learning_rate": 5e-05, "loss": 0.4652, "mean_token_accuracy": 0.9174454808235168, "num_tokens": 4539.0, "step": 7 }, { "entropy": 0.4063136577606201, "epoch": 1.6, "grad_norm": 8.6875, "learning_rate": 5e-05, "loss": 0.4188, "mean_token_accuracy": 0.9212962985038757, "num_tokens": 5191.0, "step": 8 }, { "entropy": 0.39910462498664856, "epoch": 1.8, "grad_norm": 10.1875, "learning_rate": 5e-05, "loss": 0.3337, "mean_token_accuracy": 0.9297971725463867, "num_tokens": 5836.0, "step": 9 }, { "entropy": 0.3589059114456177, "epoch": 2.0, "grad_norm": 12.625, "learning_rate": 5e-05, "loss": 0.3463, "mean_token_accuracy": 0.9427244663238525, "num_tokens": 6486.0, "step": 10 }, { "epoch": 2.0, "eval_entropy": 0.3424152731895447, "eval_loss": 0.39418846368789673, "eval_mean_token_accuracy": 0.9389536182085673, "eval_num_tokens": 6486.0, "eval_runtime": 1.0565, "eval_samples_per_second": 18.931, "eval_steps_per_second": 2.84, "step": 10 }, { "entropy": 0.3151477575302124, "epoch": 2.2, "grad_norm": 6.375, "learning_rate": 5e-05, "loss": 0.2024, "mean_token_accuracy": 0.9594383835792542, "num_tokens": 7131.0, "step": 11 }, { "entropy": 0.25139284133911133, "epoch": 2.4, "grad_norm": 5.625, "learning_rate": 5e-05, "loss": 0.2078, "mean_token_accuracy": 0.9701257944107056, "num_tokens": 7771.0, "step": 12 }, { "entropy": 0.21269312500953674, "epoch": 2.6, "grad_norm": 5.84375, "learning_rate": 5e-05, "loss": 0.1564, "mean_token_accuracy": 0.969088077545166, "num_tokens": 8422.0, "step": 13 }, { "entropy": 0.22987733781337738, "epoch": 2.8, "grad_norm": 6.03125, "learning_rate": 5e-05, "loss": 0.1812, "mean_token_accuracy": 0.9615384340286255, "num_tokens": 9076.0, "step": 14 }, { "entropy": 0.18663352727890015, "epoch": 3.0, "grad_norm": 4.4375, "learning_rate": 5e-05, "loss": 0.1414, "mean_token_accuracy": 0.973805844783783, "num_tokens": 9729.0, "step": 15 }, { "epoch": 3.0, "eval_entropy": 0.211108868320783, "eval_loss": 0.39385026693344116, "eval_mean_token_accuracy": 0.9412751793861389, "eval_num_tokens": 9729.0, "eval_runtime": 1.0422, "eval_samples_per_second": 19.19, "eval_steps_per_second": 2.879, "step": 15 }, { "entropy": 0.14480866491794586, "epoch": 3.2, "grad_norm": 3.234375, "learning_rate": 5e-05, "loss": 0.0951, "mean_token_accuracy": 0.9874411225318909, "num_tokens": 10370.0, "step": 16 }, { "entropy": 0.12099198997020721, "epoch": 3.4, "grad_norm": 3.96875, "learning_rate": 5e-05, "loss": 0.093, "mean_token_accuracy": 0.9827855825424194, "num_tokens": 11013.0, "step": 17 }, { "entropy": 0.11832999438047409, "epoch": 3.6, "grad_norm": 4.375, "learning_rate": 5e-05, "loss": 0.0918, "mean_token_accuracy": 0.9860681295394897, "num_tokens": 11663.0, "step": 18 }, { "entropy": 0.11181354522705078, "epoch": 3.8, "grad_norm": 15.5625, "learning_rate": 5e-05, "loss": 0.1291, "mean_token_accuracy": 0.9784283638000488, "num_tokens": 12316.0, "step": 19 }, { "entropy": 0.10412383824586868, "epoch": 4.0, "grad_norm": 4.96875, "learning_rate": 5e-05, "loss": 0.1079, "mean_token_accuracy": 0.9815950989723206, "num_tokens": 12972.0, "step": 20 }, { "epoch": 4.0, "eval_entropy": 0.14945324758688608, "eval_loss": 0.44532161951065063, "eval_mean_token_accuracy": 0.9364051620165507, "eval_num_tokens": 12972.0, "eval_runtime": 1.113, "eval_samples_per_second": 17.969, "eval_steps_per_second": 2.695, "step": 20 }, { "entropy": 0.10353338718414307, "epoch": 4.2, "grad_norm": 3.875, "learning_rate": 5e-05, "loss": 0.0847, "mean_token_accuracy": 0.9832572340965271, "num_tokens": 13633.0, "step": 21 }, { "entropy": 0.09107775241136551, "epoch": 4.4, "grad_norm": 2.734375, "learning_rate": 5e-05, "loss": 0.0628, "mean_token_accuracy": 0.9858490824699402, "num_tokens": 14273.0, "step": 22 }, { "entropy": 0.09076139330863953, "epoch": 4.6, "grad_norm": 2.53125, "learning_rate": 5e-05, "loss": 0.071, "mean_token_accuracy": 0.9828660488128662, "num_tokens": 14919.0, "step": 23 }, { "entropy": 0.08916156738996506, "epoch": 4.8, "grad_norm": 4.28125, "learning_rate": 5e-05, "loss": 0.0971, "mean_token_accuracy": 0.9782945513725281, "num_tokens": 15568.0, "step": 24 }, { "entropy": 0.08046303689479828, "epoch": 5.0, "grad_norm": 2.3125, "learning_rate": 5e-05, "loss": 0.0696, "mean_token_accuracy": 0.9875583052635193, "num_tokens": 16215.0, "step": 25 }, { "epoch": 5.0, "eval_entropy": 0.15401589373747507, "eval_loss": 0.3823298513889313, "eval_mean_token_accuracy": 0.9413014054298401, "eval_num_tokens": 16215.0, "eval_runtime": 1.5592, "eval_samples_per_second": 12.827, "eval_steps_per_second": 1.924, "step": 25 }, { "entropy": 0.08737289905548096, "epoch": 5.2, "grad_norm": 1.546875, "learning_rate": 5e-05, "loss": 0.0528, "mean_token_accuracy": 0.9844720363616943, "num_tokens": 16863.0, "step": 26 }, { "entropy": 0.08686110377311707, "epoch": 5.4, "grad_norm": 1.8515625, "learning_rate": 5e-05, "loss": 0.0477, "mean_token_accuracy": 0.984375, "num_tokens": 17507.0, "step": 27 }, { "entropy": 0.09398575872182846, "epoch": 5.6, "grad_norm": 2.828125, "learning_rate": 5e-05, "loss": 0.0548, "mean_token_accuracy": 0.9782270789146423, "num_tokens": 18154.0, "step": 28 }, { "entropy": 0.06279423832893372, "epoch": 5.8, "grad_norm": 1.90625, "learning_rate": 5e-05, "loss": 0.0385, "mean_token_accuracy": 0.9844720363616943, "num_tokens": 18802.0, "step": 29 }, { "entropy": 0.06225178763270378, "epoch": 6.0, "grad_norm": 2.28125, "learning_rate": 5e-05, "loss": 0.044, "mean_token_accuracy": 0.987730085849762, "num_tokens": 19458.0, "step": 30 }, { "epoch": 6.0, "eval_entropy": 0.10515476514895757, "eval_loss": 0.38724133372306824, "eval_mean_token_accuracy": 0.9487539927164713, "eval_num_tokens": 19458.0, "eval_runtime": 1.2037, "eval_samples_per_second": 16.615, "eval_steps_per_second": 2.492, "step": 30 }, { "entropy": 0.043452437967061996, "epoch": 6.2, "grad_norm": 1.734375, "learning_rate": 5e-05, "loss": 0.0316, "mean_token_accuracy": 0.9908257126808167, "num_tokens": 20116.0, "step": 31 }, { "entropy": 0.030422281473875046, "epoch": 6.4, "grad_norm": 1.3125, "learning_rate": 5e-05, "loss": 0.0223, "mean_token_accuracy": 0.9922118186950684, "num_tokens": 20762.0, "step": 32 }, { "entropy": 0.03276461362838745, "epoch": 6.6, "grad_norm": 1.2421875, "learning_rate": 5e-05, "loss": 0.0222, "mean_token_accuracy": 0.9906542301177979, "num_tokens": 21408.0, "step": 33 }, { "entropy": 0.02641253173351288, "epoch": 6.8, "grad_norm": 2.15625, "learning_rate": 5e-05, "loss": 0.0414, "mean_token_accuracy": 0.9892141819000244, "num_tokens": 22061.0, "step": 34 }, { "entropy": 0.026881147176027298, "epoch": 7.0, "grad_norm": 1.6484375, "learning_rate": 5e-05, "loss": 0.0409, "mean_token_accuracy": 0.9905660152435303, "num_tokens": 22701.0, "step": 35 }, { "epoch": 7.0, "eval_entropy": 0.08066236476103465, "eval_loss": 0.45111384987831116, "eval_mean_token_accuracy": 0.9477067589759827, "eval_num_tokens": 22701.0, "eval_runtime": 1.1878, "eval_samples_per_second": 16.837, "eval_steps_per_second": 2.526, "step": 35 }, { "entropy": 0.037951476871967316, "epoch": 7.2, "grad_norm": 1.125, "learning_rate": 5e-05, "loss": 0.0215, "mean_token_accuracy": 0.9922480583190918, "num_tokens": 23350.0, "step": 36 }, { "entropy": 0.029273193329572678, "epoch": 7.4, "grad_norm": 1.28125, "learning_rate": 5e-05, "loss": 0.0271, "mean_token_accuracy": 0.9923076629638672, "num_tokens": 24004.0, "step": 37 }, { "entropy": 0.035928014665842056, "epoch": 7.6, "grad_norm": 1.5859375, "learning_rate": 5e-05, "loss": 0.0341, "mean_token_accuracy": 0.9906250238418579, "num_tokens": 24648.0, "step": 38 }, { "entropy": 0.02118775062263012, "epoch": 7.8, "grad_norm": 1.8828125, "learning_rate": 5e-05, "loss": 0.0452, "mean_token_accuracy": 0.9890795350074768, "num_tokens": 25293.0, "step": 39 }, { "entropy": 0.029543224722146988, "epoch": 8.0, "grad_norm": 1.4453125, "learning_rate": 5e-05, "loss": 0.0356, "mean_token_accuracy": 0.9922720193862915, "num_tokens": 25944.0, "step": 40 } ], "logging_steps": 1, "max_steps": 40, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 15894048758784.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }