{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 8.0,
  "eval_steps": 500,
  "global_step": 40,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 0.48138627409935,
      "epoch": 0.2,
      "grad_norm": 348.0,
      "learning_rate": 5e-05,
      "loss": 7.8268,
      "mean_token_accuracy": 0.4326018691062927,
      "num_tokens": 642.0,
      "step": 1
    },
    {
      "entropy": 1.6079905033111572,
      "epoch": 0.4,
      "grad_norm": 101.0,
      "learning_rate": 5e-05,
      "loss": 3.2045,
      "mean_token_accuracy": 0.5241057276725769,
      "num_tokens": 1289.0,
      "step": 2
    },
    {
      "entropy": 2.244527578353882,
      "epoch": 0.6,
      "grad_norm": 46.25,
      "learning_rate": 5e-05,
      "loss": 2.0597,
      "mean_token_accuracy": 0.6548536419868469,
      "num_tokens": 1942.0,
      "step": 3
    },
    {
      "entropy": 1.716873288154602,
      "epoch": 0.8,
      "grad_norm": 25.625,
      "learning_rate": 5e-05,
      "loss": 1.3413,
      "mean_token_accuracy": 0.801232635974884,
      "num_tokens": 2595.0,
      "step": 4
    },
    {
      "entropy": 0.9718767404556274,
      "epoch": 1.0,
      "grad_norm": 15.6875,
      "learning_rate": 5e-05,
      "loss": 0.8459,
      "mean_token_accuracy": 0.8649068474769592,
      "num_tokens": 3243.0,
      "step": 5
    },
    {
      "epoch": 1.0,
      "eval_entropy": 0.5460782845815023,
      "eval_loss": 0.5847774744033813,
      "eval_mean_token_accuracy": 0.8938564459482828,
      "eval_num_tokens": 3243.0,
      "eval_runtime": 1.1326,
      "eval_samples_per_second": 17.658,
      "eval_steps_per_second": 2.649,
      "step": 5
    },
    {
      "entropy": 0.544474720954895,
      "epoch": 1.2,
      "grad_norm": 15.4375,
      "learning_rate": 5e-05,
      "loss": 0.5279,
      "mean_token_accuracy": 0.9024767875671387,
      "num_tokens": 3893.0,
      "step": 6
    },
    {
      "entropy": 0.4434409737586975,
      "epoch": 1.4,
      "grad_norm": 10.5,
      "learning_rate": 5e-05,
      "loss": 0.4652,
      "mean_token_accuracy": 0.9174454808235168,
      "num_tokens": 4539.0,
      "step": 7
    },
    {
      "entropy": 0.4063136577606201,
      "epoch": 1.6,
      "grad_norm": 8.6875,
      "learning_rate": 5e-05,
      "loss": 0.4188,
      "mean_token_accuracy": 0.9212962985038757,
      "num_tokens": 5191.0,
      "step": 8
    },
    {
      "entropy": 0.39910462498664856,
      "epoch": 1.8,
      "grad_norm": 10.1875,
      "learning_rate": 5e-05,
      "loss": 0.3337,
      "mean_token_accuracy": 0.9297971725463867,
      "num_tokens": 5836.0,
      "step": 9
    },
    {
      "entropy": 0.3589059114456177,
      "epoch": 2.0,
      "grad_norm": 12.625,
      "learning_rate": 5e-05,
      "loss": 0.3463,
      "mean_token_accuracy": 0.9427244663238525,
      "num_tokens": 6486.0,
      "step": 10
    },
    {
      "epoch": 2.0,
      "eval_entropy": 0.3424152731895447,
      "eval_loss": 0.39418846368789673,
      "eval_mean_token_accuracy": 0.9389536182085673,
      "eval_num_tokens": 6486.0,
      "eval_runtime": 1.0565,
      "eval_samples_per_second": 18.931,
      "eval_steps_per_second": 2.84,
      "step": 10
    },
    {
      "entropy": 0.3151477575302124,
      "epoch": 2.2,
      "grad_norm": 6.375,
      "learning_rate": 5e-05,
      "loss": 0.2024,
      "mean_token_accuracy": 0.9594383835792542,
      "num_tokens": 7131.0,
      "step": 11
    },
    {
      "entropy": 0.25139284133911133,
      "epoch": 2.4,
      "grad_norm": 5.625,
      "learning_rate": 5e-05,
      "loss": 0.2078,
      "mean_token_accuracy": 0.9701257944107056,
      "num_tokens": 7771.0,
      "step": 12
    },
    {
      "entropy": 0.21269312500953674,
      "epoch": 2.6,
      "grad_norm": 5.84375,
      "learning_rate": 5e-05,
      "loss": 0.1564,
      "mean_token_accuracy": 0.969088077545166,
      "num_tokens": 8422.0,
      "step": 13
    },
    {
      "entropy": 0.22987733781337738,
      "epoch": 2.8,
      "grad_norm": 6.03125,
      "learning_rate": 5e-05,
      "loss": 0.1812,
      "mean_token_accuracy": 0.9615384340286255,
      "num_tokens": 9076.0,
      "step": 14
    },
    {
      "entropy": 0.18663352727890015,
      "epoch": 3.0,
      "grad_norm": 4.4375,
      "learning_rate": 5e-05,
      "loss": 0.1414,
      "mean_token_accuracy": 0.973805844783783,
      "num_tokens": 9729.0,
      "step": 15
    },
    {
      "epoch": 3.0,
      "eval_entropy": 0.211108868320783,
      "eval_loss": 0.39385026693344116,
      "eval_mean_token_accuracy": 0.9412751793861389,
      "eval_num_tokens": 9729.0,
      "eval_runtime": 1.0422,
      "eval_samples_per_second": 19.19,
      "eval_steps_per_second": 2.879,
      "step": 15
    },
    {
      "entropy": 0.14480866491794586,
      "epoch": 3.2,
      "grad_norm": 3.234375,
      "learning_rate": 5e-05,
      "loss": 0.0951,
      "mean_token_accuracy": 0.9874411225318909,
      "num_tokens": 10370.0,
      "step": 16
    },
    {
      "entropy": 0.12099198997020721,
      "epoch": 3.4,
      "grad_norm": 3.96875,
      "learning_rate": 5e-05,
      "loss": 0.093,
      "mean_token_accuracy": 0.9827855825424194,
      "num_tokens": 11013.0,
      "step": 17
    },
    {
      "entropy": 0.11832999438047409,
      "epoch": 3.6,
      "grad_norm": 4.375,
      "learning_rate": 5e-05,
      "loss": 0.0918,
      "mean_token_accuracy": 0.9860681295394897,
      "num_tokens": 11663.0,
      "step": 18
    },
    {
      "entropy": 0.11181354522705078,
      "epoch": 3.8,
      "grad_norm": 15.5625,
      "learning_rate": 5e-05,
      "loss": 0.1291,
      "mean_token_accuracy": 0.9784283638000488,
      "num_tokens": 12316.0,
      "step": 19
    },
    {
      "entropy": 0.10412383824586868,
      "epoch": 4.0,
      "grad_norm": 4.96875,
      "learning_rate": 5e-05,
      "loss": 0.1079,
      "mean_token_accuracy": 0.9815950989723206,
      "num_tokens": 12972.0,
      "step": 20
    },
    {
      "epoch": 4.0,
      "eval_entropy": 0.14945324758688608,
      "eval_loss": 0.44532161951065063,
      "eval_mean_token_accuracy": 0.9364051620165507,
      "eval_num_tokens": 12972.0,
      "eval_runtime": 1.113,
      "eval_samples_per_second": 17.969,
      "eval_steps_per_second": 2.695,
      "step": 20
    },
    {
      "entropy": 0.10353338718414307,
      "epoch": 4.2,
      "grad_norm": 3.875,
      "learning_rate": 5e-05,
      "loss": 0.0847,
      "mean_token_accuracy": 0.9832572340965271,
      "num_tokens": 13633.0,
      "step": 21
    },
    {
      "entropy": 0.09107775241136551,
      "epoch": 4.4,
      "grad_norm": 2.734375,
      "learning_rate": 5e-05,
      "loss": 0.0628,
      "mean_token_accuracy": 0.9858490824699402,
      "num_tokens": 14273.0,
      "step": 22
    },
    {
      "entropy": 0.09076139330863953,
      "epoch": 4.6,
      "grad_norm": 2.53125,
      "learning_rate": 5e-05,
      "loss": 0.071,
      "mean_token_accuracy": 0.9828660488128662,
      "num_tokens": 14919.0,
      "step": 23
    },
    {
      "entropy": 0.08916156738996506,
      "epoch": 4.8,
      "grad_norm": 4.28125,
      "learning_rate": 5e-05,
      "loss": 0.0971,
      "mean_token_accuracy": 0.9782945513725281,
      "num_tokens": 15568.0,
      "step": 24
    },
    {
      "entropy": 0.08046303689479828,
      "epoch": 5.0,
      "grad_norm": 2.3125,
      "learning_rate": 5e-05,
      "loss": 0.0696,
      "mean_token_accuracy": 0.9875583052635193,
      "num_tokens": 16215.0,
      "step": 25
    },
    {
      "epoch": 5.0,
      "eval_entropy": 0.15401589373747507,
      "eval_loss": 0.3823298513889313,
      "eval_mean_token_accuracy": 0.9413014054298401,
      "eval_num_tokens": 16215.0,
      "eval_runtime": 1.5592,
      "eval_samples_per_second": 12.827,
      "eval_steps_per_second": 1.924,
      "step": 25
    },
    {
      "entropy": 0.08737289905548096,
      "epoch": 5.2,
      "grad_norm": 1.546875,
      "learning_rate": 5e-05,
      "loss": 0.0528,
      "mean_token_accuracy": 0.9844720363616943,
      "num_tokens": 16863.0,
      "step": 26
    },
    {
      "entropy": 0.08686110377311707,
      "epoch": 5.4,
      "grad_norm": 1.8515625,
      "learning_rate": 5e-05,
      "loss": 0.0477,
      "mean_token_accuracy": 0.984375,
      "num_tokens": 17507.0,
      "step": 27
    },
    {
      "entropy": 0.09398575872182846,
      "epoch": 5.6,
      "grad_norm": 2.828125,
      "learning_rate": 5e-05,
      "loss": 0.0548,
      "mean_token_accuracy": 0.9782270789146423,
      "num_tokens": 18154.0,
      "step": 28
    },
    {
      "entropy": 0.06279423832893372,
      "epoch": 5.8,
      "grad_norm": 1.90625,
      "learning_rate": 5e-05,
      "loss": 0.0385,
      "mean_token_accuracy": 0.9844720363616943,
      "num_tokens": 18802.0,
      "step": 29
    },
    {
      "entropy": 0.06225178763270378,
      "epoch": 6.0,
      "grad_norm": 2.28125,
      "learning_rate": 5e-05,
      "loss": 0.044,
      "mean_token_accuracy": 0.987730085849762,
      "num_tokens": 19458.0,
      "step": 30
    },
    {
      "epoch": 6.0,
      "eval_entropy": 0.10515476514895757,
      "eval_loss": 0.38724133372306824,
      "eval_mean_token_accuracy": 0.9487539927164713,
      "eval_num_tokens": 19458.0,
      "eval_runtime": 1.2037,
      "eval_samples_per_second": 16.615,
      "eval_steps_per_second": 2.492,
      "step": 30
    },
    {
      "entropy": 0.043452437967061996,
      "epoch": 6.2,
      "grad_norm": 1.734375,
      "learning_rate": 5e-05,
      "loss": 0.0316,
      "mean_token_accuracy": 0.9908257126808167,
      "num_tokens": 20116.0,
      "step": 31
    },
    {
      "entropy": 0.030422281473875046,
      "epoch": 6.4,
      "grad_norm": 1.3125,
      "learning_rate": 5e-05,
      "loss": 0.0223,
      "mean_token_accuracy": 0.9922118186950684,
      "num_tokens": 20762.0,
      "step": 32
    },
    {
      "entropy": 0.03276461362838745,
      "epoch": 6.6,
      "grad_norm": 1.2421875,
      "learning_rate": 5e-05,
      "loss": 0.0222,
      "mean_token_accuracy": 0.9906542301177979,
      "num_tokens": 21408.0,
      "step": 33
    },
    {
      "entropy": 0.02641253173351288,
      "epoch": 6.8,
      "grad_norm": 2.15625,
      "learning_rate": 5e-05,
      "loss": 0.0414,
      "mean_token_accuracy": 0.9892141819000244,
      "num_tokens": 22061.0,
      "step": 34
    },
    {
      "entropy": 0.026881147176027298,
      "epoch": 7.0,
      "grad_norm": 1.6484375,
      "learning_rate": 5e-05,
      "loss": 0.0409,
      "mean_token_accuracy": 0.9905660152435303,
      "num_tokens": 22701.0,
      "step": 35
    },
    {
      "epoch": 7.0,
      "eval_entropy": 0.08066236476103465,
      "eval_loss": 0.45111384987831116,
      "eval_mean_token_accuracy": 0.9477067589759827,
      "eval_num_tokens": 22701.0,
      "eval_runtime": 1.1878,
      "eval_samples_per_second": 16.837,
      "eval_steps_per_second": 2.526,
      "step": 35
    },
    {
      "entropy": 0.037951476871967316,
      "epoch": 7.2,
      "grad_norm": 1.125,
      "learning_rate": 5e-05,
      "loss": 0.0215,
      "mean_token_accuracy": 0.9922480583190918,
      "num_tokens": 23350.0,
      "step": 36
    },
    {
      "entropy": 0.029273193329572678,
      "epoch": 7.4,
      "grad_norm": 1.28125,
      "learning_rate": 5e-05,
      "loss": 0.0271,
      "mean_token_accuracy": 0.9923076629638672,
      "num_tokens": 24004.0,
      "step": 37
    },
    {
      "entropy": 0.035928014665842056,
      "epoch": 7.6,
      "grad_norm": 1.5859375,
      "learning_rate": 5e-05,
      "loss": 0.0341,
      "mean_token_accuracy": 0.9906250238418579,
      "num_tokens": 24648.0,
      "step": 38
    },
    {
      "entropy": 0.02118775062263012,
      "epoch": 7.8,
      "grad_norm": 1.8828125,
      "learning_rate": 5e-05,
      "loss": 0.0452,
      "mean_token_accuracy": 0.9890795350074768,
      "num_tokens": 25293.0,
      "step": 39
    },
    {
      "entropy": 0.029543224722146988,
      "epoch": 8.0,
      "grad_norm": 1.4453125,
      "learning_rate": 5e-05,
      "loss": 0.0356,
      "mean_token_accuracy": 0.9922720193862915,
      "num_tokens": 25944.0,
      "step": 40
    }
  ],
  "logging_steps": 1,
  "max_steps": 40,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 8,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 15894048758784.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}