Files
gemmafc/checkpoint-40/trainer_state.json
2025-12-23 16:53:06 +01:00

512 lines
12 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.0,
"eval_steps": 500,
"global_step": 40,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.48138627409935,
"epoch": 0.2,
"grad_norm": 348.0,
"learning_rate": 5e-05,
"loss": 7.8268,
"mean_token_accuracy": 0.4326018691062927,
"num_tokens": 642.0,
"step": 1
},
{
"entropy": 1.6079905033111572,
"epoch": 0.4,
"grad_norm": 101.0,
"learning_rate": 5e-05,
"loss": 3.2045,
"mean_token_accuracy": 0.5241057276725769,
"num_tokens": 1289.0,
"step": 2
},
{
"entropy": 2.244527578353882,
"epoch": 0.6,
"grad_norm": 46.25,
"learning_rate": 5e-05,
"loss": 2.0597,
"mean_token_accuracy": 0.6548536419868469,
"num_tokens": 1942.0,
"step": 3
},
{
"entropy": 1.716873288154602,
"epoch": 0.8,
"grad_norm": 25.625,
"learning_rate": 5e-05,
"loss": 1.3413,
"mean_token_accuracy": 0.801232635974884,
"num_tokens": 2595.0,
"step": 4
},
{
"entropy": 0.9718767404556274,
"epoch": 1.0,
"grad_norm": 15.6875,
"learning_rate": 5e-05,
"loss": 0.8459,
"mean_token_accuracy": 0.8649068474769592,
"num_tokens": 3243.0,
"step": 5
},
{
"epoch": 1.0,
"eval_entropy": 0.5460782845815023,
"eval_loss": 0.5847774744033813,
"eval_mean_token_accuracy": 0.8938564459482828,
"eval_num_tokens": 3243.0,
"eval_runtime": 1.1326,
"eval_samples_per_second": 17.658,
"eval_steps_per_second": 2.649,
"step": 5
},
{
"entropy": 0.544474720954895,
"epoch": 1.2,
"grad_norm": 15.4375,
"learning_rate": 5e-05,
"loss": 0.5279,
"mean_token_accuracy": 0.9024767875671387,
"num_tokens": 3893.0,
"step": 6
},
{
"entropy": 0.4434409737586975,
"epoch": 1.4,
"grad_norm": 10.5,
"learning_rate": 5e-05,
"loss": 0.4652,
"mean_token_accuracy": 0.9174454808235168,
"num_tokens": 4539.0,
"step": 7
},
{
"entropy": 0.4063136577606201,
"epoch": 1.6,
"grad_norm": 8.6875,
"learning_rate": 5e-05,
"loss": 0.4188,
"mean_token_accuracy": 0.9212962985038757,
"num_tokens": 5191.0,
"step": 8
},
{
"entropy": 0.39910462498664856,
"epoch": 1.8,
"grad_norm": 10.1875,
"learning_rate": 5e-05,
"loss": 0.3337,
"mean_token_accuracy": 0.9297971725463867,
"num_tokens": 5836.0,
"step": 9
},
{
"entropy": 0.3589059114456177,
"epoch": 2.0,
"grad_norm": 12.625,
"learning_rate": 5e-05,
"loss": 0.3463,
"mean_token_accuracy": 0.9427244663238525,
"num_tokens": 6486.0,
"step": 10
},
{
"epoch": 2.0,
"eval_entropy": 0.3424152731895447,
"eval_loss": 0.39418846368789673,
"eval_mean_token_accuracy": 0.9389536182085673,
"eval_num_tokens": 6486.0,
"eval_runtime": 1.0565,
"eval_samples_per_second": 18.931,
"eval_steps_per_second": 2.84,
"step": 10
},
{
"entropy": 0.3151477575302124,
"epoch": 2.2,
"grad_norm": 6.375,
"learning_rate": 5e-05,
"loss": 0.2024,
"mean_token_accuracy": 0.9594383835792542,
"num_tokens": 7131.0,
"step": 11
},
{
"entropy": 0.25139284133911133,
"epoch": 2.4,
"grad_norm": 5.625,
"learning_rate": 5e-05,
"loss": 0.2078,
"mean_token_accuracy": 0.9701257944107056,
"num_tokens": 7771.0,
"step": 12
},
{
"entropy": 0.21269312500953674,
"epoch": 2.6,
"grad_norm": 5.84375,
"learning_rate": 5e-05,
"loss": 0.1564,
"mean_token_accuracy": 0.969088077545166,
"num_tokens": 8422.0,
"step": 13
},
{
"entropy": 0.22987733781337738,
"epoch": 2.8,
"grad_norm": 6.03125,
"learning_rate": 5e-05,
"loss": 0.1812,
"mean_token_accuracy": 0.9615384340286255,
"num_tokens": 9076.0,
"step": 14
},
{
"entropy": 0.18663352727890015,
"epoch": 3.0,
"grad_norm": 4.4375,
"learning_rate": 5e-05,
"loss": 0.1414,
"mean_token_accuracy": 0.973805844783783,
"num_tokens": 9729.0,
"step": 15
},
{
"epoch": 3.0,
"eval_entropy": 0.211108868320783,
"eval_loss": 0.39385026693344116,
"eval_mean_token_accuracy": 0.9412751793861389,
"eval_num_tokens": 9729.0,
"eval_runtime": 1.0422,
"eval_samples_per_second": 19.19,
"eval_steps_per_second": 2.879,
"step": 15
},
{
"entropy": 0.14480866491794586,
"epoch": 3.2,
"grad_norm": 3.234375,
"learning_rate": 5e-05,
"loss": 0.0951,
"mean_token_accuracy": 0.9874411225318909,
"num_tokens": 10370.0,
"step": 16
},
{
"entropy": 0.12099198997020721,
"epoch": 3.4,
"grad_norm": 3.96875,
"learning_rate": 5e-05,
"loss": 0.093,
"mean_token_accuracy": 0.9827855825424194,
"num_tokens": 11013.0,
"step": 17
},
{
"entropy": 0.11832999438047409,
"epoch": 3.6,
"grad_norm": 4.375,
"learning_rate": 5e-05,
"loss": 0.0918,
"mean_token_accuracy": 0.9860681295394897,
"num_tokens": 11663.0,
"step": 18
},
{
"entropy": 0.11181354522705078,
"epoch": 3.8,
"grad_norm": 15.5625,
"learning_rate": 5e-05,
"loss": 0.1291,
"mean_token_accuracy": 0.9784283638000488,
"num_tokens": 12316.0,
"step": 19
},
{
"entropy": 0.10412383824586868,
"epoch": 4.0,
"grad_norm": 4.96875,
"learning_rate": 5e-05,
"loss": 0.1079,
"mean_token_accuracy": 0.9815950989723206,
"num_tokens": 12972.0,
"step": 20
},
{
"epoch": 4.0,
"eval_entropy": 0.14945324758688608,
"eval_loss": 0.44532161951065063,
"eval_mean_token_accuracy": 0.9364051620165507,
"eval_num_tokens": 12972.0,
"eval_runtime": 1.113,
"eval_samples_per_second": 17.969,
"eval_steps_per_second": 2.695,
"step": 20
},
{
"entropy": 0.10353338718414307,
"epoch": 4.2,
"grad_norm": 3.875,
"learning_rate": 5e-05,
"loss": 0.0847,
"mean_token_accuracy": 0.9832572340965271,
"num_tokens": 13633.0,
"step": 21
},
{
"entropy": 0.09107775241136551,
"epoch": 4.4,
"grad_norm": 2.734375,
"learning_rate": 5e-05,
"loss": 0.0628,
"mean_token_accuracy": 0.9858490824699402,
"num_tokens": 14273.0,
"step": 22
},
{
"entropy": 0.09076139330863953,
"epoch": 4.6,
"grad_norm": 2.53125,
"learning_rate": 5e-05,
"loss": 0.071,
"mean_token_accuracy": 0.9828660488128662,
"num_tokens": 14919.0,
"step": 23
},
{
"entropy": 0.08916156738996506,
"epoch": 4.8,
"grad_norm": 4.28125,
"learning_rate": 5e-05,
"loss": 0.0971,
"mean_token_accuracy": 0.9782945513725281,
"num_tokens": 15568.0,
"step": 24
},
{
"entropy": 0.08046303689479828,
"epoch": 5.0,
"grad_norm": 2.3125,
"learning_rate": 5e-05,
"loss": 0.0696,
"mean_token_accuracy": 0.9875583052635193,
"num_tokens": 16215.0,
"step": 25
},
{
"epoch": 5.0,
"eval_entropy": 0.15401589373747507,
"eval_loss": 0.3823298513889313,
"eval_mean_token_accuracy": 0.9413014054298401,
"eval_num_tokens": 16215.0,
"eval_runtime": 1.5592,
"eval_samples_per_second": 12.827,
"eval_steps_per_second": 1.924,
"step": 25
},
{
"entropy": 0.08737289905548096,
"epoch": 5.2,
"grad_norm": 1.546875,
"learning_rate": 5e-05,
"loss": 0.0528,
"mean_token_accuracy": 0.9844720363616943,
"num_tokens": 16863.0,
"step": 26
},
{
"entropy": 0.08686110377311707,
"epoch": 5.4,
"grad_norm": 1.8515625,
"learning_rate": 5e-05,
"loss": 0.0477,
"mean_token_accuracy": 0.984375,
"num_tokens": 17507.0,
"step": 27
},
{
"entropy": 0.09398575872182846,
"epoch": 5.6,
"grad_norm": 2.828125,
"learning_rate": 5e-05,
"loss": 0.0548,
"mean_token_accuracy": 0.9782270789146423,
"num_tokens": 18154.0,
"step": 28
},
{
"entropy": 0.06279423832893372,
"epoch": 5.8,
"grad_norm": 1.90625,
"learning_rate": 5e-05,
"loss": 0.0385,
"mean_token_accuracy": 0.9844720363616943,
"num_tokens": 18802.0,
"step": 29
},
{
"entropy": 0.06225178763270378,
"epoch": 6.0,
"grad_norm": 2.28125,
"learning_rate": 5e-05,
"loss": 0.044,
"mean_token_accuracy": 0.987730085849762,
"num_tokens": 19458.0,
"step": 30
},
{
"epoch": 6.0,
"eval_entropy": 0.10515476514895757,
"eval_loss": 0.38724133372306824,
"eval_mean_token_accuracy": 0.9487539927164713,
"eval_num_tokens": 19458.0,
"eval_runtime": 1.2037,
"eval_samples_per_second": 16.615,
"eval_steps_per_second": 2.492,
"step": 30
},
{
"entropy": 0.043452437967061996,
"epoch": 6.2,
"grad_norm": 1.734375,
"learning_rate": 5e-05,
"loss": 0.0316,
"mean_token_accuracy": 0.9908257126808167,
"num_tokens": 20116.0,
"step": 31
},
{
"entropy": 0.030422281473875046,
"epoch": 6.4,
"grad_norm": 1.3125,
"learning_rate": 5e-05,
"loss": 0.0223,
"mean_token_accuracy": 0.9922118186950684,
"num_tokens": 20762.0,
"step": 32
},
{
"entropy": 0.03276461362838745,
"epoch": 6.6,
"grad_norm": 1.2421875,
"learning_rate": 5e-05,
"loss": 0.0222,
"mean_token_accuracy": 0.9906542301177979,
"num_tokens": 21408.0,
"step": 33
},
{
"entropy": 0.02641253173351288,
"epoch": 6.8,
"grad_norm": 2.15625,
"learning_rate": 5e-05,
"loss": 0.0414,
"mean_token_accuracy": 0.9892141819000244,
"num_tokens": 22061.0,
"step": 34
},
{
"entropy": 0.026881147176027298,
"epoch": 7.0,
"grad_norm": 1.6484375,
"learning_rate": 5e-05,
"loss": 0.0409,
"mean_token_accuracy": 0.9905660152435303,
"num_tokens": 22701.0,
"step": 35
},
{
"epoch": 7.0,
"eval_entropy": 0.08066236476103465,
"eval_loss": 0.45111384987831116,
"eval_mean_token_accuracy": 0.9477067589759827,
"eval_num_tokens": 22701.0,
"eval_runtime": 1.1878,
"eval_samples_per_second": 16.837,
"eval_steps_per_second": 2.526,
"step": 35
},
{
"entropy": 0.037951476871967316,
"epoch": 7.2,
"grad_norm": 1.125,
"learning_rate": 5e-05,
"loss": 0.0215,
"mean_token_accuracy": 0.9922480583190918,
"num_tokens": 23350.0,
"step": 36
},
{
"entropy": 0.029273193329572678,
"epoch": 7.4,
"grad_norm": 1.28125,
"learning_rate": 5e-05,
"loss": 0.0271,
"mean_token_accuracy": 0.9923076629638672,
"num_tokens": 24004.0,
"step": 37
},
{
"entropy": 0.035928014665842056,
"epoch": 7.6,
"grad_norm": 1.5859375,
"learning_rate": 5e-05,
"loss": 0.0341,
"mean_token_accuracy": 0.9906250238418579,
"num_tokens": 24648.0,
"step": 38
},
{
"entropy": 0.02118775062263012,
"epoch": 7.8,
"grad_norm": 1.8828125,
"learning_rate": 5e-05,
"loss": 0.0452,
"mean_token_accuracy": 0.9890795350074768,
"num_tokens": 25293.0,
"step": 39
},
{
"entropy": 0.029543224722146988,
"epoch": 8.0,
"grad_norm": 1.4453125,
"learning_rate": 5e-05,
"loss": 0.0356,
"mean_token_accuracy": 0.9922720193862915,
"num_tokens": 25944.0,
"step": 40
}
],
"logging_steps": 1,
"max_steps": 40,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 15894048758784.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}