Reinforcement Learning
MLX
Safetensors
English
qwen2_5_vl
IQA
Reasoning
VLM
Pytorch
R1
GRPO
RL2R
4-bit precision
Instructions to use mlx-community/VisualQuality-R1-7B-4bit with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- MLX
How to use mlx-community/VisualQuality-R1-7B-4bit with MLX:
# Download the model from the Hub
pip install huggingface_hub[hf_xet]
huggingface-cli download --local-dir VisualQuality-R1-7B-4bit mlx-community/VisualQuality-R1-7B-4bit
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- LM Studio
| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 10.0, | |
| "eval_steps": 500, | |
| "global_step": 870, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.88542175292969, | |
| "epoch": 0.011494252873563218, | |
| "grad_norm": 1.3213221828105846, | |
| "kl": 0.0, | |
| "learning_rate": 9.988505747126437e-07, | |
| "loss": 0.0, | |
| "reward": 1.8197596073150635, | |
| "reward_std": 0.06271697580814362, | |
| "rewards/accuracy_reward": 0.8243168592453003, | |
| "rewards/format_reward": 0.9954427480697632, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.89388275146484, | |
| "epoch": 0.022988505747126436, | |
| "grad_norm": 1.006446520082249, | |
| "kl": 0.000926971435546875, | |
| "learning_rate": 9.977011494252872e-07, | |
| "loss": 0.0001, | |
| "reward": 1.7921943664550781, | |
| "reward_std": 0.06994771957397461, | |
| "rewards/accuracy_reward": 0.7961006164550781, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.96745300292969, | |
| "epoch": 0.034482758620689655, | |
| "grad_norm": 1.4023786331817765, | |
| "kl": 0.0013885498046875, | |
| "learning_rate": 9.96551724137931e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8005397319793701, | |
| "reward_std": 0.06632152944803238, | |
| "rewards/accuracy_reward": 0.8037948608398438, | |
| "rewards/format_reward": 0.9967448115348816, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.48958587646484, | |
| "epoch": 0.04597701149425287, | |
| "grad_norm": 1.0375104386573222, | |
| "kl": 0.00238037109375, | |
| "learning_rate": 9.954022988505747e-07, | |
| "loss": 0.0001, | |
| "reward": 1.801190733909607, | |
| "reward_std": 0.06357432901859283, | |
| "rewards/accuracy_reward": 0.8044459819793701, | |
| "rewards/format_reward": 0.9967448115348816, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.017578125, | |
| "epoch": 0.05747126436781609, | |
| "grad_norm": 1.2136231517307197, | |
| "kl": 0.0023956298828125, | |
| "learning_rate": 9.942528735632182e-07, | |
| "loss": 0.0001, | |
| "reward": 1.7834184169769287, | |
| "reward_std": 0.06750813126564026, | |
| "rewards/accuracy_reward": 0.7886267900466919, | |
| "rewards/format_reward": 0.9947916865348816, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.845703125, | |
| "epoch": 0.06896551724137931, | |
| "grad_norm": 1.9568797025276756, | |
| "kl": 0.003631591796875, | |
| "learning_rate": 9.93103448275862e-07, | |
| "loss": 0.0001, | |
| "reward": 1.816709280014038, | |
| "reward_std": 0.06217224523425102, | |
| "rewards/accuracy_reward": 0.8199643492698669, | |
| "rewards/format_reward": 0.9967448115348816, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.53450775146484, | |
| "epoch": 0.08045977011494253, | |
| "grad_norm": 1.0517811222524571, | |
| "kl": 0.00457763671875, | |
| "learning_rate": 9.919540229885057e-07, | |
| "loss": 0.0002, | |
| "reward": 1.8246958255767822, | |
| "reward_std": 0.05271482467651367, | |
| "rewards/accuracy_reward": 0.8246957063674927, | |
| "rewards/format_reward": 1.0, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.068359375, | |
| "epoch": 0.09195402298850575, | |
| "grad_norm": 1.1645981671220067, | |
| "kl": 0.004638671875, | |
| "learning_rate": 9.908045977011493e-07, | |
| "loss": 0.0002, | |
| "reward": 1.8276481628417969, | |
| "reward_std": 0.05832577869296074, | |
| "rewards/accuracy_reward": 0.8296012878417969, | |
| "rewards/format_reward": 0.998046875, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.41471862792969, | |
| "epoch": 0.10344827586206896, | |
| "grad_norm": 1.323752481080967, | |
| "kl": 0.0045166015625, | |
| "learning_rate": 9.89655172413793e-07, | |
| "loss": 0.0002, | |
| "reward": 1.7714653015136719, | |
| "reward_std": 0.05968251824378967, | |
| "rewards/accuracy_reward": 0.7734185457229614, | |
| "rewards/format_reward": 0.998046875, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.76042175292969, | |
| "epoch": 0.11494252873563218, | |
| "grad_norm": 1.412974502439727, | |
| "kl": 0.00445556640625, | |
| "learning_rate": 9.885057471264367e-07, | |
| "loss": 0.0002, | |
| "reward": 1.8277686834335327, | |
| "reward_std": 0.05361395701766014, | |
| "rewards/accuracy_reward": 0.8284196853637695, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.87630462646484, | |
| "epoch": 0.12643678160919541, | |
| "grad_norm": 2.075304847500042, | |
| "kl": 0.00518798828125, | |
| "learning_rate": 9.873563218390805e-07, | |
| "loss": 0.0002, | |
| "reward": 1.8043527603149414, | |
| "reward_std": 0.05421295762062073, | |
| "rewards/accuracy_reward": 0.805003821849823, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.30989837646484, | |
| "epoch": 0.13793103448275862, | |
| "grad_norm": 1.4906362546711622, | |
| "kl": 0.005401611328125, | |
| "learning_rate": 9.86206896551724e-07, | |
| "loss": 0.0002, | |
| "reward": 1.804455280303955, | |
| "reward_std": 0.054727546870708466, | |
| "rewards/accuracy_reward": 0.805106520652771, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.990234375, | |
| "epoch": 0.14942528735632185, | |
| "grad_norm": 1.5774307568902088, | |
| "kl": 0.007354736328125, | |
| "learning_rate": 9.850574712643678e-07, | |
| "loss": 0.0003, | |
| "reward": 1.8098227977752686, | |
| "reward_std": 0.05695592984557152, | |
| "rewards/accuracy_reward": 0.8098229169845581, | |
| "rewards/format_reward": 1.0, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.994140625, | |
| "epoch": 0.16091954022988506, | |
| "grad_norm": 1.1798175513976064, | |
| "kl": 0.00909423828125, | |
| "learning_rate": 9.839080459770115e-07, | |
| "loss": 0.0004, | |
| "reward": 1.804555892944336, | |
| "reward_std": 0.054406534880399704, | |
| "rewards/accuracy_reward": 0.8045558929443359, | |
| "rewards/format_reward": 1.0, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.54622650146484, | |
| "epoch": 0.1724137931034483, | |
| "grad_norm": 1.4179734216380988, | |
| "kl": 0.00946044921875, | |
| "learning_rate": 9.82758620689655e-07, | |
| "loss": 0.0004, | |
| "reward": 1.842616081237793, | |
| "reward_std": 0.04714566469192505, | |
| "rewards/accuracy_reward": 0.8432672023773193, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.39974212646484, | |
| "epoch": 0.1839080459770115, | |
| "grad_norm": 4.716568487745124, | |
| "kl": 0.01055908203125, | |
| "learning_rate": 9.816091954022988e-07, | |
| "loss": 0.0004, | |
| "reward": 1.7991819381713867, | |
| "reward_std": 0.05012369155883789, | |
| "rewards/accuracy_reward": 0.7991819381713867, | |
| "rewards/format_reward": 1.0, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.38216400146484, | |
| "epoch": 0.19540229885057472, | |
| "grad_norm": 1.288358836642226, | |
| "kl": 0.01361083984375, | |
| "learning_rate": 9.804597701149425e-07, | |
| "loss": 0.0006, | |
| "reward": 1.8223531246185303, | |
| "reward_std": 0.05100415274500847, | |
| "rewards/accuracy_reward": 0.8223528861999512, | |
| "rewards/format_reward": 1.0, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 100.85612487792969, | |
| "epoch": 0.20689655172413793, | |
| "grad_norm": 1.4234083835613787, | |
| "kl": 0.01214599609375, | |
| "learning_rate": 9.79310344827586e-07, | |
| "loss": 0.0005, | |
| "reward": 1.8194246292114258, | |
| "reward_std": 0.04791136831045151, | |
| "rewards/accuracy_reward": 0.8200756311416626, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.123046875, | |
| "epoch": 0.21839080459770116, | |
| "grad_norm": 1.4919735757668362, | |
| "kl": 0.0126953125, | |
| "learning_rate": 9.781609195402298e-07, | |
| "loss": 0.0006, | |
| "reward": 1.7989381551742554, | |
| "reward_std": 0.05323593318462372, | |
| "rewards/accuracy_reward": 0.7989381551742554, | |
| "rewards/format_reward": 1.0, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.22917175292969, | |
| "epoch": 0.22988505747126436, | |
| "grad_norm": 1.149828657266046, | |
| "kl": 0.012939453125, | |
| "learning_rate": 9.770114942528735e-07, | |
| "loss": 0.0006, | |
| "reward": 1.8272727727890015, | |
| "reward_std": 0.04600293189287186, | |
| "rewards/accuracy_reward": 0.8272727727890015, | |
| "rewards/format_reward": 1.0, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.38411712646484, | |
| "epoch": 0.2413793103448276, | |
| "grad_norm": 4.856552679666189, | |
| "kl": 0.01416015625, | |
| "learning_rate": 9.75862068965517e-07, | |
| "loss": 0.0006, | |
| "reward": 1.8167914152145386, | |
| "reward_std": 0.04727660119533539, | |
| "rewards/accuracy_reward": 0.816791296005249, | |
| "rewards/format_reward": 1.0, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.02734375, | |
| "epoch": 0.25287356321839083, | |
| "grad_norm": 1.4678458148251887, | |
| "kl": 0.0172119140625, | |
| "learning_rate": 9.747126436781608e-07, | |
| "loss": 0.0007, | |
| "reward": 1.8238710165023804, | |
| "reward_std": 0.04785631224513054, | |
| "rewards/accuracy_reward": 0.8245220184326172, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.50521087646484, | |
| "epoch": 0.26436781609195403, | |
| "grad_norm": 2.388579996164326, | |
| "kl": 0.0181884765625, | |
| "learning_rate": 9.735632183908046e-07, | |
| "loss": 0.0007, | |
| "reward": 1.8233423233032227, | |
| "reward_std": 0.04244537279009819, | |
| "rewards/accuracy_reward": 0.8233422040939331, | |
| "rewards/format_reward": 1.0, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.33138275146484, | |
| "epoch": 0.27586206896551724, | |
| "grad_norm": 1.5276786773430984, | |
| "kl": 0.01806640625, | |
| "learning_rate": 9.72413793103448e-07, | |
| "loss": 0.0008, | |
| "reward": 1.837747573852539, | |
| "reward_std": 0.041725195944309235, | |
| "rewards/accuracy_reward": 0.8383986353874207, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.095703125, | |
| "epoch": 0.28735632183908044, | |
| "grad_norm": 2.321983309381386, | |
| "kl": 0.0213623046875, | |
| "learning_rate": 9.712643678160918e-07, | |
| "loss": 0.0009, | |
| "reward": 1.7901400327682495, | |
| "reward_std": 0.04634283110499382, | |
| "rewards/accuracy_reward": 0.7901400923728943, | |
| "rewards/format_reward": 1.0, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.55859375, | |
| "epoch": 0.2988505747126437, | |
| "grad_norm": 2.351843921213514, | |
| "kl": 0.023681640625, | |
| "learning_rate": 9.701149425287356e-07, | |
| "loss": 0.001, | |
| "reward": 1.804998755455017, | |
| "reward_std": 0.0408855676651001, | |
| "rewards/accuracy_reward": 0.8049987554550171, | |
| "rewards/format_reward": 1.0, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.91732025146484, | |
| "epoch": 0.3103448275862069, | |
| "grad_norm": 1.1953728712395675, | |
| "kl": 0.023193359375, | |
| "learning_rate": 9.689655172413793e-07, | |
| "loss": 0.001, | |
| "reward": 1.8027284145355225, | |
| "reward_std": 0.04391753673553467, | |
| "rewards/accuracy_reward": 0.8033794164657593, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 100.53841400146484, | |
| "epoch": 0.3218390804597701, | |
| "grad_norm": 1.4473686865312074, | |
| "kl": 0.0211181640625, | |
| "learning_rate": 9.678160919540228e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8385992050170898, | |
| "reward_std": 0.038850087672472, | |
| "rewards/accuracy_reward": 0.8385992050170898, | |
| "rewards/format_reward": 1.0, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.99153900146484, | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 1.7266188910571618, | |
| "kl": 0.02099609375, | |
| "learning_rate": 9.666666666666666e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8396878242492676, | |
| "reward_std": 0.03943753242492676, | |
| "rewards/accuracy_reward": 0.840338945388794, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.44140625, | |
| "epoch": 0.3448275862068966, | |
| "grad_norm": 1.264532198419213, | |
| "kl": 0.0211181640625, | |
| "learning_rate": 9.655172413793103e-07, | |
| "loss": 0.0009, | |
| "reward": 1.819528341293335, | |
| "reward_std": 0.04290936142206192, | |
| "rewards/accuracy_reward": 0.8195282816886902, | |
| "rewards/format_reward": 1.0, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.53255462646484, | |
| "epoch": 0.3563218390804598, | |
| "grad_norm": 1.7079785748793026, | |
| "kl": 0.0220947265625, | |
| "learning_rate": 9.643678160919539e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8193325996398926, | |
| "reward_std": 0.039525143802165985, | |
| "rewards/accuracy_reward": 0.8193327188491821, | |
| "rewards/format_reward": 1.0, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.01237487792969, | |
| "epoch": 0.367816091954023, | |
| "grad_norm": 1.1050680979361267, | |
| "kl": 0.018310546875, | |
| "learning_rate": 9.632183908045976e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8326280117034912, | |
| "reward_std": 0.03926192969083786, | |
| "rewards/accuracy_reward": 0.833279013633728, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.38542175292969, | |
| "epoch": 0.3793103448275862, | |
| "grad_norm": 1.9594290453169385, | |
| "kl": 0.018310546875, | |
| "learning_rate": 9.620689655172413e-07, | |
| "loss": 0.0008, | |
| "reward": 1.830482840538025, | |
| "reward_std": 0.03912848234176636, | |
| "rewards/accuracy_reward": 0.8304828405380249, | |
| "rewards/format_reward": 1.0, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.35482025146484, | |
| "epoch": 0.39080459770114945, | |
| "grad_norm": 1.1967253632472934, | |
| "kl": 0.01806640625, | |
| "learning_rate": 9.609195402298849e-07, | |
| "loss": 0.0007, | |
| "reward": 1.8504703044891357, | |
| "reward_std": 0.03594374656677246, | |
| "rewards/accuracy_reward": 0.8504700660705566, | |
| "rewards/format_reward": 1.0, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.44271087646484, | |
| "epoch": 0.40229885057471265, | |
| "grad_norm": 1.4075688908737447, | |
| "kl": 0.020751953125, | |
| "learning_rate": 9.597701149425286e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8235161304473877, | |
| "reward_std": 0.03952939808368683, | |
| "rewards/accuracy_reward": 0.8241671919822693, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.69010925292969, | |
| "epoch": 0.41379310344827586, | |
| "grad_norm": 1.1584071923987709, | |
| "kl": 0.0166015625, | |
| "learning_rate": 9.586206896551724e-07, | |
| "loss": 0.0007, | |
| "reward": 1.8258438110351562, | |
| "reward_std": 0.03740541636943817, | |
| "rewards/accuracy_reward": 0.8258438110351562, | |
| "rewards/format_reward": 1.0, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.52083587646484, | |
| "epoch": 0.42528735632183906, | |
| "grad_norm": 3.559550579405242, | |
| "kl": 0.017578125, | |
| "learning_rate": 9.57471264367816e-07, | |
| "loss": 0.0007, | |
| "reward": 1.8376644849777222, | |
| "reward_std": 0.03800010681152344, | |
| "rewards/accuracy_reward": 0.8376644849777222, | |
| "rewards/format_reward": 1.0, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.80078125, | |
| "epoch": 0.4367816091954023, | |
| "grad_norm": 1.414167289320497, | |
| "kl": 0.017333984375, | |
| "learning_rate": 9.563218390804596e-07, | |
| "loss": 0.0007, | |
| "reward": 1.8298163414001465, | |
| "reward_std": 0.03874580189585686, | |
| "rewards/accuracy_reward": 0.8304674029350281, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.865234375, | |
| "epoch": 0.4482758620689655, | |
| "grad_norm": 3.3277943720668635, | |
| "kl": 0.01806640625, | |
| "learning_rate": 9.551724137931034e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8412444591522217, | |
| "reward_std": 0.0331571064889431, | |
| "rewards/accuracy_reward": 0.8412443995475769, | |
| "rewards/format_reward": 1.0, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.65625, | |
| "epoch": 0.45977011494252873, | |
| "grad_norm": 1.3938070271983027, | |
| "kl": 0.01904296875, | |
| "learning_rate": 9.540229885057471e-07, | |
| "loss": 0.0008, | |
| "reward": 1.829883098602295, | |
| "reward_std": 0.03403263911604881, | |
| "rewards/accuracy_reward": 0.8298830986022949, | |
| "rewards/format_reward": 1.0, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.31315612792969, | |
| "epoch": 0.47126436781609193, | |
| "grad_norm": 2.4737739620251444, | |
| "kl": 0.01806640625, | |
| "learning_rate": 9.528735632183908e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8481197357177734, | |
| "reward_std": 0.03319514915347099, | |
| "rewards/accuracy_reward": 0.8481197357177734, | |
| "rewards/format_reward": 1.0, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.20247650146484, | |
| "epoch": 0.4827586206896552, | |
| "grad_norm": 3.381834125397947, | |
| "kl": 0.0181884765625, | |
| "learning_rate": 9.517241379310345e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8192237615585327, | |
| "reward_std": 0.03670245409011841, | |
| "rewards/accuracy_reward": 0.8198747634887695, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.3125, | |
| "epoch": 0.4942528735632184, | |
| "grad_norm": 1.8762875005665294, | |
| "kl": 0.0189208984375, | |
| "learning_rate": 9.505747126436781e-07, | |
| "loss": 0.0008, | |
| "reward": 1.866302251815796, | |
| "reward_std": 0.029023345559835434, | |
| "rewards/accuracy_reward": 0.8663021922111511, | |
| "rewards/format_reward": 1.0, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.919921875, | |
| "epoch": 0.5057471264367817, | |
| "grad_norm": 1.364282450282579, | |
| "kl": 0.0186767578125, | |
| "learning_rate": 9.494252873563218e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8467683792114258, | |
| "reward_std": 0.032513901591300964, | |
| "rewards/accuracy_reward": 0.8467683792114258, | |
| "rewards/format_reward": 1.0, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.521484375, | |
| "epoch": 0.5172413793103449, | |
| "grad_norm": 1.26529494442379, | |
| "kl": 0.0184326171875, | |
| "learning_rate": 9.482758620689655e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8102507591247559, | |
| "reward_std": 0.03791056573390961, | |
| "rewards/accuracy_reward": 0.8115529417991638, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.720703125, | |
| "epoch": 0.5287356321839081, | |
| "grad_norm": 1.9592533419058873, | |
| "kl": 0.0203857421875, | |
| "learning_rate": 9.471264367816092e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8435447216033936, | |
| "reward_std": 0.03397071361541748, | |
| "rewards/accuracy_reward": 0.844195544719696, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.55013275146484, | |
| "epoch": 0.5402298850574713, | |
| "grad_norm": 1.434106375840011, | |
| "kl": 0.01953125, | |
| "learning_rate": 9.459770114942528e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8275771141052246, | |
| "reward_std": 0.039521463215351105, | |
| "rewards/accuracy_reward": 0.8288793563842773, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.88346862792969, | |
| "epoch": 0.5517241379310345, | |
| "grad_norm": 1.3331417373343877, | |
| "kl": 0.019775390625, | |
| "learning_rate": 9.448275862068965e-07, | |
| "loss": 0.0008, | |
| "reward": 1.847245216369629, | |
| "reward_std": 0.031749702990055084, | |
| "rewards/accuracy_reward": 0.8472453951835632, | |
| "rewards/format_reward": 1.0, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.53841400146484, | |
| "epoch": 0.5632183908045977, | |
| "grad_norm": 1.2065807063642335, | |
| "kl": 0.0191650390625, | |
| "learning_rate": 9.436781609195402e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8429206609725952, | |
| "reward_std": 0.030585885047912598, | |
| "rewards/accuracy_reward": 0.8435718417167664, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.23503112792969, | |
| "epoch": 0.5747126436781609, | |
| "grad_norm": 1.8805762744730514, | |
| "kl": 0.0216064453125, | |
| "learning_rate": 9.425287356321838e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8460264205932617, | |
| "reward_std": 0.030056733638048172, | |
| "rewards/accuracy_reward": 0.8460264205932617, | |
| "rewards/format_reward": 1.0, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.20964050292969, | |
| "epoch": 0.5862068965517241, | |
| "grad_norm": 2.416272655538971, | |
| "kl": 0.031494140625, | |
| "learning_rate": 9.413793103448276e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8473304510116577, | |
| "reward_std": 0.028418144211173058, | |
| "rewards/accuracy_reward": 0.8473303318023682, | |
| "rewards/format_reward": 1.0, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.59245300292969, | |
| "epoch": 0.5977011494252874, | |
| "grad_norm": 1.7021853778739644, | |
| "kl": 0.0206298828125, | |
| "learning_rate": 9.402298850574713e-07, | |
| "loss": 0.0009, | |
| "reward": 1.837215781211853, | |
| "reward_std": 0.027952462434768677, | |
| "rewards/accuracy_reward": 0.837215781211853, | |
| "rewards/format_reward": 1.0, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.181640625, | |
| "epoch": 0.6091954022988506, | |
| "grad_norm": 1.5829777832061342, | |
| "kl": 0.020263671875, | |
| "learning_rate": 9.390804597701148e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8315935134887695, | |
| "reward_std": 0.03717650845646858, | |
| "rewards/accuracy_reward": 0.8315935134887695, | |
| "rewards/format_reward": 1.0, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.23112487792969, | |
| "epoch": 0.6206896551724138, | |
| "grad_norm": 1.5951363696277052, | |
| "kl": 0.027099609375, | |
| "learning_rate": 9.379310344827586e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8406846523284912, | |
| "reward_std": 0.031024938449263573, | |
| "rewards/accuracy_reward": 0.8413355350494385, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.03190612792969, | |
| "epoch": 0.632183908045977, | |
| "grad_norm": 1.3835743922707338, | |
| "kl": 0.018798828125, | |
| "learning_rate": 9.367816091954023e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8285382986068726, | |
| "reward_std": 0.02894814871251583, | |
| "rewards/accuracy_reward": 0.8285383582115173, | |
| "rewards/format_reward": 1.0, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.86849212646484, | |
| "epoch": 0.6436781609195402, | |
| "grad_norm": 1.3591671428361543, | |
| "kl": 0.019287109375, | |
| "learning_rate": 9.356321839080458e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8349835872650146, | |
| "reward_std": 0.033723484724760056, | |
| "rewards/accuracy_reward": 0.8356344699859619, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.513671875, | |
| "epoch": 0.6551724137931034, | |
| "grad_norm": 2.359691828231059, | |
| "kl": 0.02392578125, | |
| "learning_rate": 9.344827586206896e-07, | |
| "loss": 0.001, | |
| "reward": 1.8344968557357788, | |
| "reward_std": 0.03300865739583969, | |
| "rewards/accuracy_reward": 0.83514803647995, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.46875, | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 1.4850162206462427, | |
| "kl": 0.020263671875, | |
| "learning_rate": 9.333333333333333e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8346723318099976, | |
| "reward_std": 0.03354714438319206, | |
| "rewards/accuracy_reward": 0.8346724510192871, | |
| "rewards/format_reward": 1.0, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.7109375, | |
| "epoch": 0.6781609195402298, | |
| "grad_norm": 1.2483974787682879, | |
| "kl": 0.02197265625, | |
| "learning_rate": 9.321839080459771e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8298664093017578, | |
| "reward_std": 0.03340629115700722, | |
| "rewards/accuracy_reward": 0.8305175304412842, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.80078125, | |
| "epoch": 0.6896551724137931, | |
| "grad_norm": 2.363012214927635, | |
| "kl": 0.02978515625, | |
| "learning_rate": 9.310344827586206e-07, | |
| "loss": 0.0012, | |
| "reward": 1.831284761428833, | |
| "reward_std": 0.03303222730755806, | |
| "rewards/accuracy_reward": 0.8319358229637146, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.81771087646484, | |
| "epoch": 0.7011494252873564, | |
| "grad_norm": 4.0916014790353445, | |
| "kl": 0.0196533203125, | |
| "learning_rate": 9.298850574712643e-07, | |
| "loss": 0.0008, | |
| "reward": 1.839867353439331, | |
| "reward_std": 0.031664229929447174, | |
| "rewards/accuracy_reward": 0.8398674726486206, | |
| "rewards/format_reward": 1.0, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.52214050292969, | |
| "epoch": 0.7126436781609196, | |
| "grad_norm": 3.2309398976005297, | |
| "kl": 0.022216796875, | |
| "learning_rate": 9.287356321839081e-07, | |
| "loss": 0.0009, | |
| "reward": 1.854928970336914, | |
| "reward_std": 0.031918901950120926, | |
| "rewards/accuracy_reward": 0.8549291491508484, | |
| "rewards/format_reward": 1.0, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.703125, | |
| "epoch": 0.7241379310344828, | |
| "grad_norm": 1.9168568902881002, | |
| "kl": 0.019287109375, | |
| "learning_rate": 9.275862068965516e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8610057830810547, | |
| "reward_std": 0.02559598907828331, | |
| "rewards/accuracy_reward": 0.8610057234764099, | |
| "rewards/format_reward": 1.0, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.36067962646484, | |
| "epoch": 0.735632183908046, | |
| "grad_norm": 1.3566272735502378, | |
| "kl": 0.0191650390625, | |
| "learning_rate": 9.264367816091954e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8530144691467285, | |
| "reward_std": 0.0317344069480896, | |
| "rewards/accuracy_reward": 0.8530145883560181, | |
| "rewards/format_reward": 1.0, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.51888275146484, | |
| "epoch": 0.7471264367816092, | |
| "grad_norm": 1.9418693251170596, | |
| "kl": 0.01806640625, | |
| "learning_rate": 9.252873563218391e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8553061485290527, | |
| "reward_std": 0.028751468285918236, | |
| "rewards/accuracy_reward": 0.8553061485290527, | |
| "rewards/format_reward": 1.0, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.462890625, | |
| "epoch": 0.7586206896551724, | |
| "grad_norm": 1.6584510097653566, | |
| "kl": 0.0179443359375, | |
| "learning_rate": 9.241379310344826e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8472495079040527, | |
| "reward_std": 0.029529428109526634, | |
| "rewards/accuracy_reward": 0.8472495079040527, | |
| "rewards/format_reward": 1.0, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.38671875, | |
| "epoch": 0.7701149425287356, | |
| "grad_norm": 1.5373869592745633, | |
| "kl": 0.0185546875, | |
| "learning_rate": 9.229885057471264e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8314546346664429, | |
| "reward_std": 0.03092101775109768, | |
| "rewards/accuracy_reward": 0.8314546346664429, | |
| "rewards/format_reward": 1.0, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.568359375, | |
| "epoch": 0.7816091954022989, | |
| "grad_norm": 1.4559609132563736, | |
| "kl": 0.01806640625, | |
| "learning_rate": 9.218390804597701e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8663041591644287, | |
| "reward_std": 0.026553651317954063, | |
| "rewards/accuracy_reward": 0.8663042783737183, | |
| "rewards/format_reward": 1.0, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.46419525146484, | |
| "epoch": 0.7931034482758621, | |
| "grad_norm": 1.2008629627793295, | |
| "kl": 0.0186767578125, | |
| "learning_rate": 9.206896551724138e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8613271713256836, | |
| "reward_std": 0.02855812758207321, | |
| "rewards/accuracy_reward": 0.8613271713256836, | |
| "rewards/format_reward": 1.0, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.99153900146484, | |
| "epoch": 0.8045977011494253, | |
| "grad_norm": 4.9275967882281275, | |
| "kl": 0.020751953125, | |
| "learning_rate": 9.195402298850574e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8591216802597046, | |
| "reward_std": 0.028761819005012512, | |
| "rewards/accuracy_reward": 0.8591216802597046, | |
| "rewards/format_reward": 1.0, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.91536712646484, | |
| "epoch": 0.8160919540229885, | |
| "grad_norm": 9.850480698430928, | |
| "kl": 0.01953125, | |
| "learning_rate": 9.183908045977011e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8541549444198608, | |
| "reward_std": 0.028836514800786972, | |
| "rewards/accuracy_reward": 0.8548059463500977, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.697265625, | |
| "epoch": 0.8275862068965517, | |
| "grad_norm": 1.210498300407391, | |
| "kl": 0.0201416015625, | |
| "learning_rate": 9.172413793103448e-07, | |
| "loss": 0.0009, | |
| "reward": 1.843854308128357, | |
| "reward_std": 0.030750762671232224, | |
| "rewards/accuracy_reward": 0.8438543677330017, | |
| "rewards/format_reward": 1.0, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.44075775146484, | |
| "epoch": 0.8390804597701149, | |
| "grad_norm": 1.8321539372725062, | |
| "kl": 0.0205078125, | |
| "learning_rate": 9.160919540229884e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8534529209136963, | |
| "reward_std": 0.028409739956259727, | |
| "rewards/accuracy_reward": 0.8534530401229858, | |
| "rewards/format_reward": 1.0, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.00065612792969, | |
| "epoch": 0.8505747126436781, | |
| "grad_norm": 1.1934111085412287, | |
| "kl": 0.0218505859375, | |
| "learning_rate": 9.149425287356322e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8261594772338867, | |
| "reward_std": 0.028989605605602264, | |
| "rewards/accuracy_reward": 0.8268105387687683, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.86458587646484, | |
| "epoch": 0.8620689655172413, | |
| "grad_norm": 1.1973438361715183, | |
| "kl": 0.02294921875, | |
| "learning_rate": 9.137931034482759e-07, | |
| "loss": 0.001, | |
| "reward": 1.8427714109420776, | |
| "reward_std": 0.028574138879776, | |
| "rewards/accuracy_reward": 0.8427714109420776, | |
| "rewards/format_reward": 1.0, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.2109375, | |
| "epoch": 0.8735632183908046, | |
| "grad_norm": 1.3720840552975169, | |
| "kl": 0.0208740234375, | |
| "learning_rate": 9.126436781609194e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8438997268676758, | |
| "reward_std": 0.034771427512168884, | |
| "rewards/accuracy_reward": 0.8452019095420837, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.87825775146484, | |
| "epoch": 0.8850574712643678, | |
| "grad_norm": 1.2733923638265987, | |
| "kl": 0.021240234375, | |
| "learning_rate": 9.114942528735632e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8525419235229492, | |
| "reward_std": 0.02576223388314247, | |
| "rewards/accuracy_reward": 0.8525419235229492, | |
| "rewards/format_reward": 1.0, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.30599212646484, | |
| "epoch": 0.896551724137931, | |
| "grad_norm": 1.5683715987496294, | |
| "kl": 0.022705078125, | |
| "learning_rate": 9.103448275862069e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8584070205688477, | |
| "reward_std": 0.026499662548303604, | |
| "rewards/accuracy_reward": 0.8590580821037292, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.30208587646484, | |
| "epoch": 0.9080459770114943, | |
| "grad_norm": 1.7144360008338546, | |
| "kl": 0.0206298828125, | |
| "learning_rate": 9.091954022988505e-07, | |
| "loss": 0.0009, | |
| "reward": 1.836766004562378, | |
| "reward_std": 0.03153412044048309, | |
| "rewards/accuracy_reward": 0.8374168276786804, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.78646087646484, | |
| "epoch": 0.9195402298850575, | |
| "grad_norm": 1.4249106766015625, | |
| "kl": 0.021484375, | |
| "learning_rate": 9.080459770114942e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8593868017196655, | |
| "reward_std": 0.030518915504217148, | |
| "rewards/accuracy_reward": 0.859386682510376, | |
| "rewards/format_reward": 1.0, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.84700775146484, | |
| "epoch": 0.9310344827586207, | |
| "grad_norm": 1.6378679289752103, | |
| "kl": 0.0205078125, | |
| "learning_rate": 9.068965517241379e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8533859252929688, | |
| "reward_std": 0.02878081053495407, | |
| "rewards/accuracy_reward": 0.8533859252929688, | |
| "rewards/format_reward": 1.0, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.23567962646484, | |
| "epoch": 0.9425287356321839, | |
| "grad_norm": 1.412691105042645, | |
| "kl": 0.021484375, | |
| "learning_rate": 9.057471264367816e-07, | |
| "loss": 0.0009, | |
| "reward": 1.847813606262207, | |
| "reward_std": 0.03038616292178631, | |
| "rewards/accuracy_reward": 0.8484646677970886, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.27474212646484, | |
| "epoch": 0.9540229885057471, | |
| "grad_norm": 2.0344799768549784, | |
| "kl": 0.0244140625, | |
| "learning_rate": 9.045977011494252e-07, | |
| "loss": 0.001, | |
| "reward": 1.8461253643035889, | |
| "reward_std": 0.03104616329073906, | |
| "rewards/accuracy_reward": 0.8467763662338257, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.6640625, | |
| "epoch": 0.9655172413793104, | |
| "grad_norm": 1.5604062393784244, | |
| "kl": 0.0211181640625, | |
| "learning_rate": 9.034482758620689e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8446769714355469, | |
| "reward_std": 0.025367258116602898, | |
| "rewards/accuracy_reward": 0.8446769714355469, | |
| "rewards/format_reward": 1.0, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.02734375, | |
| "epoch": 0.9770114942528736, | |
| "grad_norm": 1.362117300400181, | |
| "kl": 0.0216064453125, | |
| "learning_rate": 9.022988505747126e-07, | |
| "loss": 0.0009, | |
| "reward": 1.850293517112732, | |
| "reward_std": 0.027060410007834435, | |
| "rewards/accuracy_reward": 0.8502935171127319, | |
| "rewards/format_reward": 1.0, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.21159362792969, | |
| "epoch": 0.9885057471264368, | |
| "grad_norm": 4.622693800475361, | |
| "kl": 0.0196533203125, | |
| "learning_rate": 9.011494252873562e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8619520664215088, | |
| "reward_std": 0.02936243824660778, | |
| "rewards/accuracy_reward": 0.8626030683517456, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.17837524414062, | |
| "epoch": 1.0, | |
| "grad_norm": 1.1739865165675025, | |
| "kl": 0.0206298828125, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0009, | |
| "reward": 1.865034818649292, | |
| "reward_std": 0.024553239345550537, | |
| "rewards/accuracy_reward": 0.8650349974632263, | |
| "rewards/format_reward": 1.0, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.279296875, | |
| "epoch": 1.0114942528735633, | |
| "grad_norm": 1.7212140289390774, | |
| "kl": 0.022216796875, | |
| "learning_rate": 8.988505747126436e-07, | |
| "loss": 0.001, | |
| "reward": 1.844975471496582, | |
| "reward_std": 0.031139438971877098, | |
| "rewards/accuracy_reward": 0.8449755907058716, | |
| "rewards/format_reward": 1.0, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.97721862792969, | |
| "epoch": 1.0229885057471264, | |
| "grad_norm": 2.1208124047907018, | |
| "kl": 0.02294921875, | |
| "learning_rate": 8.977011494252873e-07, | |
| "loss": 0.001, | |
| "reward": 1.8310832977294922, | |
| "reward_std": 0.030721893534064293, | |
| "rewards/accuracy_reward": 0.8310832381248474, | |
| "rewards/format_reward": 1.0, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.72982025146484, | |
| "epoch": 1.0344827586206897, | |
| "grad_norm": 1.7904458786353679, | |
| "kl": 0.025390625, | |
| "learning_rate": 8.96551724137931e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8376115560531616, | |
| "reward_std": 0.03200274333357811, | |
| "rewards/accuracy_reward": 0.8382627367973328, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.40495300292969, | |
| "epoch": 1.0459770114942528, | |
| "grad_norm": 1.6518476245455913, | |
| "kl": 0.0216064453125, | |
| "learning_rate": 8.954022988505747e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8409684896469116, | |
| "reward_std": 0.02987781912088394, | |
| "rewards/accuracy_reward": 0.8416194915771484, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.744140625, | |
| "epoch": 1.0574712643678161, | |
| "grad_norm": 2.410653089678698, | |
| "kl": 0.021484375, | |
| "learning_rate": 8.942528735632184e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8022823333740234, | |
| "reward_std": 0.03017442300915718, | |
| "rewards/accuracy_reward": 0.8022822141647339, | |
| "rewards/format_reward": 1.0, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.03255462646484, | |
| "epoch": 1.0689655172413792, | |
| "grad_norm": 1.070833843767388, | |
| "kl": 0.0230712890625, | |
| "learning_rate": 8.93103448275862e-07, | |
| "loss": 0.001, | |
| "reward": 1.8557833433151245, | |
| "reward_std": 0.030436282977461815, | |
| "rewards/accuracy_reward": 0.8570854663848877, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.84635925292969, | |
| "epoch": 1.0804597701149425, | |
| "grad_norm": 1.2651791750173826, | |
| "kl": 0.0216064453125, | |
| "learning_rate": 8.919540229885057e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8365752696990967, | |
| "reward_std": 0.025262486189603806, | |
| "rewards/accuracy_reward": 0.8372262716293335, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.35091400146484, | |
| "epoch": 1.0919540229885056, | |
| "grad_norm": 2.5454147273169463, | |
| "kl": 0.0213623046875, | |
| "learning_rate": 8.908045977011494e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8451789617538452, | |
| "reward_std": 0.02507897838950157, | |
| "rewards/accuracy_reward": 0.8451790809631348, | |
| "rewards/format_reward": 1.0, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.16862487792969, | |
| "epoch": 1.103448275862069, | |
| "grad_norm": 1.5956876645494014, | |
| "kl": 0.020263671875, | |
| "learning_rate": 8.896551724137931e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8393771648406982, | |
| "reward_std": 0.02759551629424095, | |
| "rewards/accuracy_reward": 0.8393771052360535, | |
| "rewards/format_reward": 1.0, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.232421875, | |
| "epoch": 1.1149425287356323, | |
| "grad_norm": 1.6538617605670678, | |
| "kl": 0.0230712890625, | |
| "learning_rate": 8.885057471264368e-07, | |
| "loss": 0.001, | |
| "reward": 1.8187679052352905, | |
| "reward_std": 0.028722627088427544, | |
| "rewards/accuracy_reward": 0.8187679052352905, | |
| "rewards/format_reward": 1.0, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.37109375, | |
| "epoch": 1.1264367816091954, | |
| "grad_norm": 2.466954207815668, | |
| "kl": 0.020263671875, | |
| "learning_rate": 8.873563218390804e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8639543056488037, | |
| "reward_std": 0.02820052206516266, | |
| "rewards/accuracy_reward": 0.8646053075790405, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.6171875, | |
| "epoch": 1.1379310344827587, | |
| "grad_norm": 1.252778063732612, | |
| "kl": 0.0211181640625, | |
| "learning_rate": 8.862068965517241e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8373963832855225, | |
| "reward_std": 0.027982115745544434, | |
| "rewards/accuracy_reward": 0.837396502494812, | |
| "rewards/format_reward": 1.0, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.939453125, | |
| "epoch": 1.1494252873563218, | |
| "grad_norm": 1.5031364905308602, | |
| "kl": 0.021240234375, | |
| "learning_rate": 8.850574712643678e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8384751081466675, | |
| "reward_std": 0.02889677882194519, | |
| "rewards/accuracy_reward": 0.8384751081466675, | |
| "rewards/format_reward": 1.0, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.173828125, | |
| "epoch": 1.160919540229885, | |
| "grad_norm": 3.2098798594469957, | |
| "kl": 0.022216796875, | |
| "learning_rate": 8.839080459770114e-07, | |
| "loss": 0.001, | |
| "reward": 1.858123779296875, | |
| "reward_std": 0.027859613299369812, | |
| "rewards/accuracy_reward": 0.8581236600875854, | |
| "rewards/format_reward": 1.0, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.611328125, | |
| "epoch": 1.1724137931034484, | |
| "grad_norm": 1.6395969811083824, | |
| "kl": 0.022705078125, | |
| "learning_rate": 8.827586206896551e-07, | |
| "loss": 0.001, | |
| "reward": 1.8613924980163574, | |
| "reward_std": 0.030063219368457794, | |
| "rewards/accuracy_reward": 0.8626946806907654, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.9609375, | |
| "epoch": 1.1839080459770115, | |
| "grad_norm": 1.5759802737821602, | |
| "kl": 0.0218505859375, | |
| "learning_rate": 8.816091954022988e-07, | |
| "loss": 0.001, | |
| "reward": 1.87135648727417, | |
| "reward_std": 0.025703629478812218, | |
| "rewards/accuracy_reward": 0.8713564872741699, | |
| "rewards/format_reward": 1.0, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.47591400146484, | |
| "epoch": 1.1954022988505748, | |
| "grad_norm": 2.1153792944695753, | |
| "kl": 0.0218505859375, | |
| "learning_rate": 8.804597701149424e-07, | |
| "loss": 0.001, | |
| "reward": 1.8344066143035889, | |
| "reward_std": 0.02677982673048973, | |
| "rewards/accuracy_reward": 0.8344065546989441, | |
| "rewards/format_reward": 1.0, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.17513275146484, | |
| "epoch": 1.206896551724138, | |
| "grad_norm": 1.5729705960989235, | |
| "kl": 0.0223388671875, | |
| "learning_rate": 8.793103448275862e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8618988990783691, | |
| "reward_std": 0.025562942028045654, | |
| "rewards/accuracy_reward": 0.8618988990783691, | |
| "rewards/format_reward": 1.0, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.10417175292969, | |
| "epoch": 1.2183908045977012, | |
| "grad_norm": 1.5280787952862458, | |
| "kl": 0.021484375, | |
| "learning_rate": 8.781609195402299e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8250049352645874, | |
| "reward_std": 0.030421411618590355, | |
| "rewards/accuracy_reward": 0.8263069987297058, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.99674987792969, | |
| "epoch": 1.2298850574712643, | |
| "grad_norm": 1.3417425340508164, | |
| "kl": 0.02294921875, | |
| "learning_rate": 8.770114942528735e-07, | |
| "loss": 0.001, | |
| "reward": 1.8627097606658936, | |
| "reward_std": 0.029694318771362305, | |
| "rewards/accuracy_reward": 0.8633607625961304, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.16732025146484, | |
| "epoch": 1.2413793103448276, | |
| "grad_norm": 1.3637947595566382, | |
| "kl": 0.023193359375, | |
| "learning_rate": 8.758620689655172e-07, | |
| "loss": 0.001, | |
| "reward": 1.8729504346847534, | |
| "reward_std": 0.02589436247944832, | |
| "rewards/accuracy_reward": 0.8729504942893982, | |
| "rewards/format_reward": 1.0, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.79232025146484, | |
| "epoch": 1.2528735632183907, | |
| "grad_norm": 1.8473222859814702, | |
| "kl": 0.02294921875, | |
| "learning_rate": 8.747126436781609e-07, | |
| "loss": 0.001, | |
| "reward": 1.8625543117523193, | |
| "reward_std": 0.026772310957312584, | |
| "rewards/accuracy_reward": 0.8625543117523193, | |
| "rewards/format_reward": 1.0, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.390625, | |
| "epoch": 1.264367816091954, | |
| "grad_norm": 1.6150275987926945, | |
| "kl": 0.024658203125, | |
| "learning_rate": 8.735632183908046e-07, | |
| "loss": 0.001, | |
| "reward": 1.8422784805297852, | |
| "reward_std": 0.030604541301727295, | |
| "rewards/accuracy_reward": 0.8435806632041931, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.96354675292969, | |
| "epoch": 1.2758620689655173, | |
| "grad_norm": 1.7104109971917698, | |
| "kl": 0.03125, | |
| "learning_rate": 8.724137931034482e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8350896835327148, | |
| "reward_std": 0.028940599411725998, | |
| "rewards/accuracy_reward": 0.8357405662536621, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.47135925292969, | |
| "epoch": 1.2873563218390804, | |
| "grad_norm": 1.48493737972391, | |
| "kl": 0.02490234375, | |
| "learning_rate": 8.712643678160919e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8481062650680542, | |
| "reward_std": 0.029237091541290283, | |
| "rewards/accuracy_reward": 0.8481062650680542, | |
| "rewards/format_reward": 1.0, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.853515625, | |
| "epoch": 1.2988505747126438, | |
| "grad_norm": 1.8812767455198556, | |
| "kl": 0.024169921875, | |
| "learning_rate": 8.701149425287357e-07, | |
| "loss": 0.001, | |
| "reward": 1.8743027448654175, | |
| "reward_std": 0.028526946902275085, | |
| "rewards/accuracy_reward": 0.8743027448654175, | |
| "rewards/format_reward": 1.0, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.11653900146484, | |
| "epoch": 1.3103448275862069, | |
| "grad_norm": 1.4552951626639703, | |
| "kl": 0.0244140625, | |
| "learning_rate": 8.689655172413792e-07, | |
| "loss": 0.001, | |
| "reward": 1.8737125396728516, | |
| "reward_std": 0.02662818133831024, | |
| "rewards/accuracy_reward": 0.8737127184867859, | |
| "rewards/format_reward": 1.0, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.68620300292969, | |
| "epoch": 1.3218390804597702, | |
| "grad_norm": 1.2099323721036837, | |
| "kl": 0.0242919921875, | |
| "learning_rate": 8.67816091954023e-07, | |
| "loss": 0.001, | |
| "reward": 1.8564056158065796, | |
| "reward_std": 0.03203584998846054, | |
| "rewards/accuracy_reward": 0.8570567965507507, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.44140625, | |
| "epoch": 1.3333333333333333, | |
| "grad_norm": 1.512025900233657, | |
| "kl": 0.0262451171875, | |
| "learning_rate": 8.666666666666667e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8566768169403076, | |
| "reward_std": 0.028156783431768417, | |
| "rewards/accuracy_reward": 0.857327938079834, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.33073425292969, | |
| "epoch": 1.3448275862068966, | |
| "grad_norm": 2.0358354675170904, | |
| "kl": 0.02392578125, | |
| "learning_rate": 8.655172413793102e-07, | |
| "loss": 0.001, | |
| "reward": 1.8653233051300049, | |
| "reward_std": 0.03316108509898186, | |
| "rewards/accuracy_reward": 0.8666254281997681, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.40234375, | |
| "epoch": 1.3563218390804597, | |
| "grad_norm": 4.5664679538349136, | |
| "kl": 0.0260009765625, | |
| "learning_rate": 8.64367816091954e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8475828170776367, | |
| "reward_std": 0.028350701555609703, | |
| "rewards/accuracy_reward": 0.8475827574729919, | |
| "rewards/format_reward": 1.0, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.509765625, | |
| "epoch": 1.367816091954023, | |
| "grad_norm": 3.3498974515563305, | |
| "kl": 0.02392578125, | |
| "learning_rate": 8.632183908045977e-07, | |
| "loss": 0.001, | |
| "reward": 1.8256216049194336, | |
| "reward_std": 0.03040180169045925, | |
| "rewards/accuracy_reward": 0.8256216049194336, | |
| "rewards/format_reward": 1.0, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.119140625, | |
| "epoch": 1.3793103448275863, | |
| "grad_norm": 2.0523739279685618, | |
| "kl": 0.03564453125, | |
| "learning_rate": 8.620689655172412e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8660404682159424, | |
| "reward_std": 0.02924610674381256, | |
| "rewards/accuracy_reward": 0.8660405874252319, | |
| "rewards/format_reward": 1.0, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.34440612792969, | |
| "epoch": 1.3908045977011494, | |
| "grad_norm": 4.105539506193381, | |
| "kl": 0.0361328125, | |
| "learning_rate": 8.60919540229885e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8672317266464233, | |
| "reward_std": 0.0260650422424078, | |
| "rewards/accuracy_reward": 0.8672318458557129, | |
| "rewards/format_reward": 1.0, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.32878112792969, | |
| "epoch": 1.4022988505747127, | |
| "grad_norm": 1.7057843588411452, | |
| "kl": 0.0303955078125, | |
| "learning_rate": 8.597701149425287e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8730971813201904, | |
| "reward_std": 0.032900359481573105, | |
| "rewards/accuracy_reward": 0.8737481832504272, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 100.82421875, | |
| "epoch": 1.4137931034482758, | |
| "grad_norm": 2.6319293895634828, | |
| "kl": 0.024658203125, | |
| "learning_rate": 8.586206896551725e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8619554042816162, | |
| "reward_std": 0.025756051763892174, | |
| "rewards/accuracy_reward": 0.8619554042816162, | |
| "rewards/format_reward": 1.0, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.46810150146484, | |
| "epoch": 1.4252873563218391, | |
| "grad_norm": 1.8929420183153929, | |
| "kl": 0.0264892578125, | |
| "learning_rate": 8.57471264367816e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8555949926376343, | |
| "reward_std": 0.027035661041736603, | |
| "rewards/accuracy_reward": 0.855595052242279, | |
| "rewards/format_reward": 1.0, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 100.79362487792969, | |
| "epoch": 1.4367816091954024, | |
| "grad_norm": 4.4900781638741645, | |
| "kl": 0.0264892578125, | |
| "learning_rate": 8.563218390804597e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8362255096435547, | |
| "reward_std": 0.0311919953674078, | |
| "rewards/accuracy_reward": 0.836225688457489, | |
| "rewards/format_reward": 1.0, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.28190612792969, | |
| "epoch": 1.4482758620689655, | |
| "grad_norm": 2.686615982073592, | |
| "kl": 0.0294189453125, | |
| "learning_rate": 8.551724137931035e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8675731420516968, | |
| "reward_std": 0.02920936420559883, | |
| "rewards/accuracy_reward": 0.8682241439819336, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.333984375, | |
| "epoch": 1.4597701149425286, | |
| "grad_norm": 1.4366785062548348, | |
| "kl": 0.0245361328125, | |
| "learning_rate": 8.54022988505747e-07, | |
| "loss": 0.001, | |
| "reward": 1.8725385665893555, | |
| "reward_std": 0.028989439830183983, | |
| "rewards/accuracy_reward": 0.8725385665893555, | |
| "rewards/format_reward": 1.0, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.35286712646484, | |
| "epoch": 1.471264367816092, | |
| "grad_norm": 3.153210014316318, | |
| "kl": 0.025146484375, | |
| "learning_rate": 8.528735632183908e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8572996854782104, | |
| "reward_std": 0.02787836454808712, | |
| "rewards/accuracy_reward": 0.8579506874084473, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.70768737792969, | |
| "epoch": 1.4827586206896552, | |
| "grad_norm": 1.312276972160894, | |
| "kl": 0.023681640625, | |
| "learning_rate": 8.517241379310345e-07, | |
| "loss": 0.001, | |
| "reward": 1.8692939281463623, | |
| "reward_std": 0.028887853026390076, | |
| "rewards/accuracy_reward": 0.8692940473556519, | |
| "rewards/format_reward": 1.0, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.83333587646484, | |
| "epoch": 1.4942528735632183, | |
| "grad_norm": 1.7208602570293514, | |
| "kl": 0.0279541015625, | |
| "learning_rate": 8.50574712643678e-07, | |
| "loss": 0.0012, | |
| "reward": 1.86876380443573, | |
| "reward_std": 0.026345182210206985, | |
| "rewards/accuracy_reward": 0.86876380443573, | |
| "rewards/format_reward": 1.0, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.58919525146484, | |
| "epoch": 1.5057471264367817, | |
| "grad_norm": 1.5360698582173107, | |
| "kl": 0.02490234375, | |
| "learning_rate": 8.494252873563218e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8543965816497803, | |
| "reward_std": 0.025941472500562668, | |
| "rewards/accuracy_reward": 0.8543965816497803, | |
| "rewards/format_reward": 1.0, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.38607025146484, | |
| "epoch": 1.5172413793103448, | |
| "grad_norm": 1.5741802825985347, | |
| "kl": 0.0244140625, | |
| "learning_rate": 8.482758620689655e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8572925329208374, | |
| "reward_std": 0.02648422122001648, | |
| "rewards/accuracy_reward": 0.8572925329208374, | |
| "rewards/format_reward": 1.0, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.529296875, | |
| "epoch": 1.528735632183908, | |
| "grad_norm": 2.5696729376896577, | |
| "kl": 0.02685546875, | |
| "learning_rate": 8.471264367816092e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8575305938720703, | |
| "reward_std": 0.023815011605620384, | |
| "rewards/accuracy_reward": 0.8575305938720703, | |
| "rewards/format_reward": 1.0, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.123046875, | |
| "epoch": 1.5402298850574714, | |
| "grad_norm": 1.6144061698095113, | |
| "kl": 0.029052734375, | |
| "learning_rate": 8.459770114942528e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8380780220031738, | |
| "reward_std": 0.02718261629343033, | |
| "rewards/accuracy_reward": 0.8380780220031738, | |
| "rewards/format_reward": 1.0, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.32552337646484, | |
| "epoch": 1.5517241379310345, | |
| "grad_norm": 1.4219475281295146, | |
| "kl": 0.0281982421875, | |
| "learning_rate": 8.448275862068965e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8486030101776123, | |
| "reward_std": 0.02719944715499878, | |
| "rewards/accuracy_reward": 0.8492540121078491, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.57747650146484, | |
| "epoch": 1.5632183908045976, | |
| "grad_norm": 1.2505748762461004, | |
| "kl": 0.0284423828125, | |
| "learning_rate": 8.436781609195402e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8550881147384644, | |
| "reward_std": 0.024348240345716476, | |
| "rewards/accuracy_reward": 0.8550881147384644, | |
| "rewards/format_reward": 1.0, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.931640625, | |
| "epoch": 1.5747126436781609, | |
| "grad_norm": 1.4653956935392716, | |
| "kl": 0.030517578125, | |
| "learning_rate": 8.425287356321838e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8590949773788452, | |
| "reward_std": 0.024702435359358788, | |
| "rewards/accuracy_reward": 0.8597459197044373, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.66796875, | |
| "epoch": 1.5862068965517242, | |
| "grad_norm": 1.5116616200702753, | |
| "kl": 0.0257568359375, | |
| "learning_rate": 8.413793103448276e-07, | |
| "loss": 0.0011, | |
| "reward": 1.865983247756958, | |
| "reward_std": 0.02651612088084221, | |
| "rewards/accuracy_reward": 0.8659831881523132, | |
| "rewards/format_reward": 1.0, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.427734375, | |
| "epoch": 1.5977011494252875, | |
| "grad_norm": 2.539622540808917, | |
| "kl": 0.026123046875, | |
| "learning_rate": 8.402298850574713e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8506149053573608, | |
| "reward_std": 0.024511417374014854, | |
| "rewards/accuracy_reward": 0.8506147861480713, | |
| "rewards/format_reward": 1.0, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.677734375, | |
| "epoch": 1.6091954022988506, | |
| "grad_norm": 2.7029566355060575, | |
| "kl": 0.030029296875, | |
| "learning_rate": 8.390804597701148e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8749054670333862, | |
| "reward_std": 0.022635221481323242, | |
| "rewards/accuracy_reward": 0.8749054670333862, | |
| "rewards/format_reward": 1.0, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.28190612792969, | |
| "epoch": 1.6206896551724137, | |
| "grad_norm": 1.6125529702224055, | |
| "kl": 0.0306396484375, | |
| "learning_rate": 8.379310344827586e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8687496185302734, | |
| "reward_std": 0.02769436687231064, | |
| "rewards/accuracy_reward": 0.8687496185302734, | |
| "rewards/format_reward": 1.0, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.73177337646484, | |
| "epoch": 1.632183908045977, | |
| "grad_norm": 1.6175458343103775, | |
| "kl": 0.03076171875, | |
| "learning_rate": 8.367816091954023e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8663349151611328, | |
| "reward_std": 0.028380822390317917, | |
| "rewards/accuracy_reward": 0.8663349151611328, | |
| "rewards/format_reward": 1.0, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.64128112792969, | |
| "epoch": 1.6436781609195403, | |
| "grad_norm": 1.5215374212801362, | |
| "kl": 0.03515625, | |
| "learning_rate": 8.35632183908046e-07, | |
| "loss": 0.0015, | |
| "reward": 1.850799560546875, | |
| "reward_std": 0.026175182312726974, | |
| "rewards/accuracy_reward": 0.8507994413375854, | |
| "rewards/format_reward": 1.0, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.19792175292969, | |
| "epoch": 1.6551724137931034, | |
| "grad_norm": 1.3135082334407917, | |
| "kl": 0.035400390625, | |
| "learning_rate": 8.344827586206896e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8789458274841309, | |
| "reward_std": 0.023477703332901, | |
| "rewards/accuracy_reward": 0.8789458274841309, | |
| "rewards/format_reward": 1.0, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.71940612792969, | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 1.7597191781848593, | |
| "kl": 0.0400390625, | |
| "learning_rate": 8.333333333333333e-07, | |
| "loss": 0.0017, | |
| "reward": 1.862714171409607, | |
| "reward_std": 0.02495882660150528, | |
| "rewards/accuracy_reward": 0.8627142906188965, | |
| "rewards/format_reward": 1.0, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.86653900146484, | |
| "epoch": 1.6781609195402298, | |
| "grad_norm": 1.1271572137611656, | |
| "kl": 0.041015625, | |
| "learning_rate": 8.32183908045977e-07, | |
| "loss": 0.0017, | |
| "reward": 1.863875389099121, | |
| "reward_std": 0.022430753335356712, | |
| "rewards/accuracy_reward": 0.8638753890991211, | |
| "rewards/format_reward": 1.0, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.78971862792969, | |
| "epoch": 1.6896551724137931, | |
| "grad_norm": 1.7900229849601172, | |
| "kl": 0.0361328125, | |
| "learning_rate": 8.310344827586206e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8638134002685547, | |
| "reward_std": 0.024309411644935608, | |
| "rewards/accuracy_reward": 0.8638134002685547, | |
| "rewards/format_reward": 1.0, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.46159362792969, | |
| "epoch": 1.7011494252873565, | |
| "grad_norm": 1.4211028736618636, | |
| "kl": 0.03466796875, | |
| "learning_rate": 8.298850574712643e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8384380340576172, | |
| "reward_std": 0.027933349832892418, | |
| "rewards/accuracy_reward": 0.8390890955924988, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.04817962646484, | |
| "epoch": 1.7126436781609196, | |
| "grad_norm": 1.6133215111280255, | |
| "kl": 0.033935546875, | |
| "learning_rate": 8.28735632183908e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8644790649414062, | |
| "reward_std": 0.023334093391895294, | |
| "rewards/accuracy_reward": 0.8644790649414062, | |
| "rewards/format_reward": 1.0, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.50130462646484, | |
| "epoch": 1.7241379310344827, | |
| "grad_norm": 1.71716415208747, | |
| "kl": 0.0380859375, | |
| "learning_rate": 8.275862068965517e-07, | |
| "loss": 0.0016, | |
| "reward": 1.863791823387146, | |
| "reward_std": 0.025837119668722153, | |
| "rewards/accuracy_reward": 0.8644427061080933, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.55339050292969, | |
| "epoch": 1.735632183908046, | |
| "grad_norm": 1.7799429692812343, | |
| "kl": 0.032470703125, | |
| "learning_rate": 8.264367816091954e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8520851135253906, | |
| "reward_std": 0.02567559853196144, | |
| "rewards/accuracy_reward": 0.8520849943161011, | |
| "rewards/format_reward": 1.0, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.16015625, | |
| "epoch": 1.7471264367816093, | |
| "grad_norm": 6.845861661718817, | |
| "kl": 0.033203125, | |
| "learning_rate": 8.25287356321839e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8801301717758179, | |
| "reward_std": 0.02385757677257061, | |
| "rewards/accuracy_reward": 0.8801302909851074, | |
| "rewards/format_reward": 1.0, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.34635925292969, | |
| "epoch": 1.7586206896551724, | |
| "grad_norm": 2.31131545485098, | |
| "kl": 0.030029296875, | |
| "learning_rate": 8.241379310344827e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8744109869003296, | |
| "reward_std": 0.02264220081269741, | |
| "rewards/accuracy_reward": 0.8744109869003296, | |
| "rewards/format_reward": 1.0, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.56901550292969, | |
| "epoch": 1.7701149425287355, | |
| "grad_norm": 1.773846804268088, | |
| "kl": 0.0302734375, | |
| "learning_rate": 8.229885057471264e-07, | |
| "loss": 0.0013, | |
| "reward": 1.864478349685669, | |
| "reward_std": 0.022709330543875694, | |
| "rewards/accuracy_reward": 0.864478349685669, | |
| "rewards/format_reward": 1.0, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.39583587646484, | |
| "epoch": 1.7816091954022988, | |
| "grad_norm": 3.139231735878295, | |
| "kl": 0.03662109375, | |
| "learning_rate": 8.218390804597701e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8694690465927124, | |
| "reward_std": 0.023755362257361412, | |
| "rewards/accuracy_reward": 0.8701201677322388, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.47786712646484, | |
| "epoch": 1.793103448275862, | |
| "grad_norm": 1.978341379497861, | |
| "kl": 0.033203125, | |
| "learning_rate": 8.206896551724138e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8873121738433838, | |
| "reward_std": 0.02343663200736046, | |
| "rewards/accuracy_reward": 0.8873120546340942, | |
| "rewards/format_reward": 1.0, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.451171875, | |
| "epoch": 1.8045977011494254, | |
| "grad_norm": 1.9961053634302923, | |
| "kl": 0.031494140625, | |
| "learning_rate": 8.195402298850574e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8565791845321655, | |
| "reward_std": 0.02433241717517376, | |
| "rewards/accuracy_reward": 0.8565791845321655, | |
| "rewards/format_reward": 1.0, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.66015625, | |
| "epoch": 1.8160919540229885, | |
| "grad_norm": 1.4619162689448402, | |
| "kl": 0.0279541015625, | |
| "learning_rate": 8.183908045977011e-07, | |
| "loss": 0.0012, | |
| "reward": 1.893612265586853, | |
| "reward_std": 0.020092740654945374, | |
| "rewards/accuracy_reward": 0.893612265586853, | |
| "rewards/format_reward": 1.0, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.15299987792969, | |
| "epoch": 1.8275862068965516, | |
| "grad_norm": 1.3651687933062773, | |
| "kl": 0.0283203125, | |
| "learning_rate": 8.172413793103448e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8662446737289429, | |
| "reward_std": 0.02392234466969967, | |
| "rewards/accuracy_reward": 0.8662446737289429, | |
| "rewards/format_reward": 1.0, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.61458587646484, | |
| "epoch": 1.839080459770115, | |
| "grad_norm": 1.2135275799512748, | |
| "kl": 0.0262451171875, | |
| "learning_rate": 8.160919540229885e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8714094161987305, | |
| "reward_std": 0.024804111570119858, | |
| "rewards/accuracy_reward": 0.8714094161987305, | |
| "rewards/format_reward": 1.0, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.740234375, | |
| "epoch": 1.8505747126436782, | |
| "grad_norm": 1.8554584998042072, | |
| "kl": 0.03125, | |
| "learning_rate": 8.149425287356322e-07, | |
| "loss": 0.0013, | |
| "reward": 1.885859489440918, | |
| "reward_std": 0.0239472147077322, | |
| "rewards/accuracy_reward": 0.8858596682548523, | |
| "rewards/format_reward": 1.0, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.990234375, | |
| "epoch": 1.8620689655172413, | |
| "grad_norm": 2.7992855237483445, | |
| "kl": 0.027587890625, | |
| "learning_rate": 8.137931034482758e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8619565963745117, | |
| "reward_std": 0.025602132081985474, | |
| "rewards/accuracy_reward": 0.8619565963745117, | |
| "rewards/format_reward": 1.0, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.46810150146484, | |
| "epoch": 1.8735632183908046, | |
| "grad_norm": 1.153159461213296, | |
| "kl": 0.0291748046875, | |
| "learning_rate": 8.126436781609195e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8758127689361572, | |
| "reward_std": 0.02637392282485962, | |
| "rewards/accuracy_reward": 0.876463770866394, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.09440612792969, | |
| "epoch": 1.8850574712643677, | |
| "grad_norm": 1.8638466478476596, | |
| "kl": 0.031005859375, | |
| "learning_rate": 8.114942528735632e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8715770244598389, | |
| "reward_std": 0.02399243414402008, | |
| "rewards/accuracy_reward": 0.8715770244598389, | |
| "rewards/format_reward": 1.0, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 116.47265625, | |
| "epoch": 1.896551724137931, | |
| "grad_norm": 1.1827867594913961, | |
| "kl": 0.0291748046875, | |
| "learning_rate": 8.103448275862068e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8500595092773438, | |
| "reward_std": 0.030623754486441612, | |
| "rewards/accuracy_reward": 0.8520126342773438, | |
| "rewards/format_reward": 0.998046875, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 116.31510925292969, | |
| "epoch": 1.9080459770114944, | |
| "grad_norm": 3.8800308289928274, | |
| "kl": 0.037353515625, | |
| "learning_rate": 8.091954022988506e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8412752151489258, | |
| "reward_std": 0.02564075216650963, | |
| "rewards/accuracy_reward": 0.8412752151489258, | |
| "rewards/format_reward": 1.0, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.9921875, | |
| "epoch": 1.9195402298850575, | |
| "grad_norm": 1.7793621376166886, | |
| "kl": 0.0322265625, | |
| "learning_rate": 8.080459770114942e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8666815757751465, | |
| "reward_std": 0.01957223378121853, | |
| "rewards/accuracy_reward": 0.8666815757751465, | |
| "rewards/format_reward": 1.0, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 116.001953125, | |
| "epoch": 1.9310344827586206, | |
| "grad_norm": 3.366934016860489, | |
| "kl": 0.0306396484375, | |
| "learning_rate": 8.068965517241378e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8858134746551514, | |
| "reward_std": 0.021044649183750153, | |
| "rewards/accuracy_reward": 0.8858134150505066, | |
| "rewards/format_reward": 1.0, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.40495300292969, | |
| "epoch": 1.9425287356321839, | |
| "grad_norm": 1.4480350054291504, | |
| "kl": 0.06689453125, | |
| "learning_rate": 8.057471264367816e-07, | |
| "loss": 0.0028, | |
| "reward": 1.888282299041748, | |
| "reward_std": 0.025501668453216553, | |
| "rewards/accuracy_reward": 0.8889333605766296, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.05339050292969, | |
| "epoch": 1.9540229885057472, | |
| "grad_norm": 1.9158676691938177, | |
| "kl": 0.0322265625, | |
| "learning_rate": 8.045977011494253e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8491878509521484, | |
| "reward_std": 0.023884495720267296, | |
| "rewards/accuracy_reward": 0.8491878509521484, | |
| "rewards/format_reward": 1.0, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.62760925292969, | |
| "epoch": 1.9655172413793105, | |
| "grad_norm": 2.780653624143398, | |
| "kl": 0.033203125, | |
| "learning_rate": 8.03448275862069e-07, | |
| "loss": 0.0014, | |
| "reward": 1.887178897857666, | |
| "reward_std": 0.02224629744887352, | |
| "rewards/accuracy_reward": 0.887178897857666, | |
| "rewards/format_reward": 1.0, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.57096862792969, | |
| "epoch": 1.9770114942528736, | |
| "grad_norm": 1.9318099805591311, | |
| "kl": 0.031494140625, | |
| "learning_rate": 8.022988505747126e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8778488636016846, | |
| "reward_std": 0.024461418390274048, | |
| "rewards/accuracy_reward": 0.8784998655319214, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.85612487792969, | |
| "epoch": 1.9885057471264367, | |
| "grad_norm": 1.8812732346859014, | |
| "kl": 0.031982421875, | |
| "learning_rate": 8.011494252873563e-07, | |
| "loss": 0.0013, | |
| "reward": 1.896362543106079, | |
| "reward_std": 0.021257508546113968, | |
| "rewards/accuracy_reward": 0.8963624238967896, | |
| "rewards/format_reward": 1.0, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.70365142822266, | |
| "epoch": 2.0, | |
| "grad_norm": 4.371328763954834, | |
| "kl": 0.033935546875, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8666094541549683, | |
| "reward_std": 0.023954574018716812, | |
| "rewards/accuracy_reward": 0.8680139780044556, | |
| "rewards/format_reward": 0.9985955357551575, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.845703125, | |
| "epoch": 2.0114942528735633, | |
| "grad_norm": 3.642725753887041, | |
| "kl": 0.028564453125, | |
| "learning_rate": 7.988505747126436e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8493850231170654, | |
| "reward_std": 0.021547168493270874, | |
| "rewards/accuracy_reward": 0.8493850231170654, | |
| "rewards/format_reward": 1.0, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.64518737792969, | |
| "epoch": 2.0229885057471266, | |
| "grad_norm": 1.813754894659516, | |
| "kl": 0.0308837890625, | |
| "learning_rate": 7.977011494252873e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8406703472137451, | |
| "reward_std": 0.022422365844249725, | |
| "rewards/accuracy_reward": 0.8406702876091003, | |
| "rewards/format_reward": 1.0, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.38021087646484, | |
| "epoch": 2.0344827586206895, | |
| "grad_norm": 1.1813724927973488, | |
| "kl": 0.030029296875, | |
| "learning_rate": 7.965517241379311e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8357830047607422, | |
| "reward_std": 0.02784503623843193, | |
| "rewards/accuracy_reward": 0.8370852470397949, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.87890625, | |
| "epoch": 2.045977011494253, | |
| "grad_norm": 1.8350693828851994, | |
| "kl": 0.032470703125, | |
| "learning_rate": 7.954022988505746e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8329381942749023, | |
| "reward_std": 0.02206650748848915, | |
| "rewards/accuracy_reward": 0.8329381942749023, | |
| "rewards/format_reward": 1.0, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.98112487792969, | |
| "epoch": 2.057471264367816, | |
| "grad_norm": 6.041427121899008, | |
| "kl": 0.03662109375, | |
| "learning_rate": 7.942528735632184e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8575410842895508, | |
| "reward_std": 0.022278612479567528, | |
| "rewards/accuracy_reward": 0.8575412034988403, | |
| "rewards/format_reward": 1.0, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.130859375, | |
| "epoch": 2.0689655172413794, | |
| "grad_norm": 1.3127737451001564, | |
| "kl": 0.037109375, | |
| "learning_rate": 7.931034482758621e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8538506031036377, | |
| "reward_std": 0.026692640036344528, | |
| "rewards/accuracy_reward": 0.8551527261734009, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.36849212646484, | |
| "epoch": 2.0804597701149423, | |
| "grad_norm": 2.4756435889269186, | |
| "kl": 0.03759765625, | |
| "learning_rate": 7.919540229885056e-07, | |
| "loss": 0.0016, | |
| "reward": 1.867905616760254, | |
| "reward_std": 0.023630604147911072, | |
| "rewards/accuracy_reward": 0.8685566782951355, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.97005462646484, | |
| "epoch": 2.0919540229885056, | |
| "grad_norm": 2.0378313336318823, | |
| "kl": 0.039794921875, | |
| "learning_rate": 7.908045977011494e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8557872772216797, | |
| "reward_std": 0.024489006027579308, | |
| "rewards/accuracy_reward": 0.856438159942627, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.88802337646484, | |
| "epoch": 2.103448275862069, | |
| "grad_norm": 1.9136054549569206, | |
| "kl": 0.042724609375, | |
| "learning_rate": 7.896551724137931e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8639075756072998, | |
| "reward_std": 0.025589074939489365, | |
| "rewards/accuracy_reward": 0.865860641002655, | |
| "rewards/format_reward": 0.998046875, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.36653900146484, | |
| "epoch": 2.1149425287356323, | |
| "grad_norm": 2.633698077075855, | |
| "kl": 0.038330078125, | |
| "learning_rate": 7.885057471264366e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8632612228393555, | |
| "reward_std": 0.030802268534898758, | |
| "rewards/accuracy_reward": 0.8671674728393555, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.41796875, | |
| "epoch": 2.1264367816091956, | |
| "grad_norm": 2.407318367905216, | |
| "kl": 0.03662109375, | |
| "learning_rate": 7.873563218390804e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8410747051239014, | |
| "reward_std": 0.02962721884250641, | |
| "rewards/accuracy_reward": 0.844329833984375, | |
| "rewards/format_reward": 0.9967448115348816, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.22591400146484, | |
| "epoch": 2.1379310344827585, | |
| "grad_norm": 1.506301090880529, | |
| "kl": 0.033935546875, | |
| "learning_rate": 7.862068965517241e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8682615756988525, | |
| "reward_std": 0.03243795782327652, | |
| "rewards/accuracy_reward": 0.8734698295593262, | |
| "rewards/format_reward": 0.9947916865348816, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.826171875, | |
| "epoch": 2.1494252873563218, | |
| "grad_norm": 3.409263140415921, | |
| "kl": 0.03662109375, | |
| "learning_rate": 7.850574712643679e-07, | |
| "loss": 0.0015, | |
| "reward": 1.875942587852478, | |
| "reward_std": 0.03060537576675415, | |
| "rewards/accuracy_reward": 0.8791979551315308, | |
| "rewards/format_reward": 0.9967448115348816, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.181640625, | |
| "epoch": 2.160919540229885, | |
| "grad_norm": 3.163868782667335, | |
| "kl": 0.031005859375, | |
| "learning_rate": 7.839080459770114e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8670661449432373, | |
| "reward_std": 0.03131501376628876, | |
| "rewards/accuracy_reward": 0.8703212738037109, | |
| "rewards/format_reward": 0.9967448115348816, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.701171875, | |
| "epoch": 2.1724137931034484, | |
| "grad_norm": 2.26227947471076, | |
| "kl": 0.0299072265625, | |
| "learning_rate": 7.827586206896552e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8607233762741089, | |
| "reward_std": 0.030661556869745255, | |
| "rewards/accuracy_reward": 0.8639785647392273, | |
| "rewards/format_reward": 0.9967448115348816, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.59505462646484, | |
| "epoch": 2.1839080459770113, | |
| "grad_norm": 2.1434523868036885, | |
| "kl": 0.0308837890625, | |
| "learning_rate": 7.816091954022989e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8534635305404663, | |
| "reward_std": 0.03587997704744339, | |
| "rewards/accuracy_reward": 0.8580207824707031, | |
| "rewards/format_reward": 0.9954427480697632, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.111328125, | |
| "epoch": 2.1954022988505746, | |
| "grad_norm": 1.8017139066303491, | |
| "kl": 0.034423828125, | |
| "learning_rate": 7.804597701149424e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8646326065063477, | |
| "reward_std": 0.031285952776670456, | |
| "rewards/accuracy_reward": 0.8672367930412292, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.44921875, | |
| "epoch": 2.206896551724138, | |
| "grad_norm": 1.4767880401798514, | |
| "kl": 0.0302734375, | |
| "learning_rate": 7.793103448275862e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8693158626556396, | |
| "reward_std": 0.03322295472025871, | |
| "rewards/accuracy_reward": 0.8725711107254028, | |
| "rewards/format_reward": 0.9967448115348816, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.24674987792969, | |
| "epoch": 2.218390804597701, | |
| "grad_norm": 1.1854249053762298, | |
| "kl": 0.0294189453125, | |
| "learning_rate": 7.781609195402299e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8650622367858887, | |
| "reward_std": 0.03180694580078125, | |
| "rewards/accuracy_reward": 0.867666482925415, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.73372650146484, | |
| "epoch": 2.2298850574712645, | |
| "grad_norm": 1.1139391781844512, | |
| "kl": 0.033447265625, | |
| "learning_rate": 7.770114942528734e-07, | |
| "loss": 0.0014, | |
| "reward": 1.87180757522583, | |
| "reward_std": 0.03504088148474693, | |
| "rewards/accuracy_reward": 0.8757138252258301, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.34245300292969, | |
| "epoch": 2.2413793103448274, | |
| "grad_norm": 1.6586210582781218, | |
| "kl": 0.032470703125, | |
| "learning_rate": 7.758620689655172e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8676183223724365, | |
| "reward_std": 0.028974760323762894, | |
| "rewards/accuracy_reward": 0.8689204454421997, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.36653900146484, | |
| "epoch": 2.2528735632183907, | |
| "grad_norm": 1.1693377970360763, | |
| "kl": 0.0361328125, | |
| "learning_rate": 7.747126436781609e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8650755882263184, | |
| "reward_std": 0.03180007264018059, | |
| "rewards/accuracy_reward": 0.8676798343658447, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.658203125, | |
| "epoch": 2.264367816091954, | |
| "grad_norm": 5.251694511136823, | |
| "kl": 0.032470703125, | |
| "learning_rate": 7.735632183908046e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8851394653320312, | |
| "reward_std": 0.028069045394659042, | |
| "rewards/accuracy_reward": 0.8864415884017944, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.24674987792969, | |
| "epoch": 2.2758620689655173, | |
| "grad_norm": 1.3160075358259606, | |
| "kl": 0.0306396484375, | |
| "learning_rate": 7.724137931034482e-07, | |
| "loss": 0.0013, | |
| "reward": 1.871217966079712, | |
| "reward_std": 0.02678101509809494, | |
| "rewards/accuracy_reward": 0.8718689680099487, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.22396087646484, | |
| "epoch": 2.2873563218390807, | |
| "grad_norm": 1.7363926773399032, | |
| "kl": 0.0322265625, | |
| "learning_rate": 7.712643678160919e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8681142330169678, | |
| "reward_std": 0.025049438700079918, | |
| "rewards/accuracy_reward": 0.8687652349472046, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.36328125, | |
| "epoch": 2.2988505747126435, | |
| "grad_norm": 1.6358927960776593, | |
| "kl": 0.032470703125, | |
| "learning_rate": 7.701149425287356e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8802045583724976, | |
| "reward_std": 0.025107329711318016, | |
| "rewards/accuracy_reward": 0.8808557391166687, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.34049987792969, | |
| "epoch": 2.310344827586207, | |
| "grad_norm": 1.9769472874812075, | |
| "kl": 0.034912109375, | |
| "learning_rate": 7.689655172413792e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8836421966552734, | |
| "reward_std": 0.023718908429145813, | |
| "rewards/accuracy_reward": 0.8842934370040894, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.58203125, | |
| "epoch": 2.32183908045977, | |
| "grad_norm": 1.5235580660765125, | |
| "kl": 0.04150390625, | |
| "learning_rate": 7.67816091954023e-07, | |
| "loss": 0.0017, | |
| "reward": 1.872357726097107, | |
| "reward_std": 0.027882186695933342, | |
| "rewards/accuracy_reward": 0.8736597895622253, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.212890625, | |
| "epoch": 2.3333333333333335, | |
| "grad_norm": 1.6195272681997732, | |
| "kl": 0.03515625, | |
| "learning_rate": 7.666666666666667e-07, | |
| "loss": 0.0015, | |
| "reward": 1.864328145980835, | |
| "reward_std": 0.023437950760126114, | |
| "rewards/accuracy_reward": 0.8649791479110718, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.525390625, | |
| "epoch": 2.344827586206897, | |
| "grad_norm": 1.8341465818857292, | |
| "kl": 0.038818359375, | |
| "learning_rate": 7.655172413793102e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8627145290374756, | |
| "reward_std": 0.022373493760824203, | |
| "rewards/accuracy_reward": 0.8627144694328308, | |
| "rewards/format_reward": 1.0, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.689453125, | |
| "epoch": 2.3563218390804597, | |
| "grad_norm": 1.1804395489400663, | |
| "kl": 0.0390625, | |
| "learning_rate": 7.64367816091954e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8903453350067139, | |
| "reward_std": 0.02181018702685833, | |
| "rewards/accuracy_reward": 0.8903453350067139, | |
| "rewards/format_reward": 1.0, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.21875, | |
| "epoch": 2.367816091954023, | |
| "grad_norm": 1.54557267471986, | |
| "kl": 0.03955078125, | |
| "learning_rate": 7.632183908045977e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8669939041137695, | |
| "reward_std": 0.02308422140777111, | |
| "rewards/accuracy_reward": 0.8669940829277039, | |
| "rewards/format_reward": 1.0, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.130859375, | |
| "epoch": 2.3793103448275863, | |
| "grad_norm": 1.6770832687532953, | |
| "kl": 0.041259765625, | |
| "learning_rate": 7.620689655172414e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8921034336090088, | |
| "reward_std": 0.024557670578360558, | |
| "rewards/accuracy_reward": 0.8934054374694824, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.56510925292969, | |
| "epoch": 2.3908045977011496, | |
| "grad_norm": 1.53505457334918, | |
| "kl": 0.04150390625, | |
| "learning_rate": 7.60919540229885e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8635462522506714, | |
| "reward_std": 0.028757482767105103, | |
| "rewards/accuracy_reward": 0.8635462522506714, | |
| "rewards/format_reward": 1.0, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.521484375, | |
| "epoch": 2.4022988505747125, | |
| "grad_norm": 1.5279469918158477, | |
| "kl": 0.0458984375, | |
| "learning_rate": 7.597701149425287e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8711223602294922, | |
| "reward_std": 0.022866621613502502, | |
| "rewards/accuracy_reward": 0.8711225390434265, | |
| "rewards/format_reward": 1.0, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.22135925292969, | |
| "epoch": 2.413793103448276, | |
| "grad_norm": 1.9431939650471932, | |
| "kl": 0.04296875, | |
| "learning_rate": 7.586206896551724e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8868978023529053, | |
| "reward_std": 0.02192886732518673, | |
| "rewards/accuracy_reward": 0.8868978023529053, | |
| "rewards/format_reward": 1.0, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.751953125, | |
| "epoch": 2.425287356321839, | |
| "grad_norm": 1.499449740827804, | |
| "kl": 0.042236328125, | |
| "learning_rate": 7.57471264367816e-07, | |
| "loss": 0.0017, | |
| "reward": 1.865365743637085, | |
| "reward_std": 0.025590822100639343, | |
| "rewards/accuracy_reward": 0.8660167455673218, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.19075775146484, | |
| "epoch": 2.4367816091954024, | |
| "grad_norm": 2.0546219049276684, | |
| "kl": 0.042724609375, | |
| "learning_rate": 7.563218390804598e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8754686117172241, | |
| "reward_std": 0.02179308794438839, | |
| "rewards/accuracy_reward": 0.8754686713218689, | |
| "rewards/format_reward": 1.0, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.09700775146484, | |
| "epoch": 2.4482758620689653, | |
| "grad_norm": 1.7497621009886284, | |
| "kl": 0.0361328125, | |
| "learning_rate": 7.551724137931034e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8816776275634766, | |
| "reward_std": 0.023087939247488976, | |
| "rewards/accuracy_reward": 0.8816776275634766, | |
| "rewards/format_reward": 1.0, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.85612487792969, | |
| "epoch": 2.4597701149425286, | |
| "grad_norm": 1.6521998481125082, | |
| "kl": 0.037109375, | |
| "learning_rate": 7.540229885057471e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8775031566619873, | |
| "reward_std": 0.022142350673675537, | |
| "rewards/accuracy_reward": 0.8781541585922241, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.697265625, | |
| "epoch": 2.471264367816092, | |
| "grad_norm": 1.4603832529701488, | |
| "kl": 0.037353515625, | |
| "learning_rate": 7.528735632183908e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8763501644134521, | |
| "reward_std": 0.02745872735977173, | |
| "rewards/accuracy_reward": 0.8770012259483337, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 100.99284362792969, | |
| "epoch": 2.4827586206896552, | |
| "grad_norm": 1.4199974173722163, | |
| "kl": 0.033447265625, | |
| "learning_rate": 7.517241379310344e-07, | |
| "loss": 0.0014, | |
| "reward": 1.879393458366394, | |
| "reward_std": 0.023880278691649437, | |
| "rewards/accuracy_reward": 0.8800445795059204, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 99.63216400146484, | |
| "epoch": 2.4942528735632186, | |
| "grad_norm": 1.4805312918485005, | |
| "kl": 0.034423828125, | |
| "learning_rate": 7.505747126436781e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8871581554412842, | |
| "reward_std": 0.023221522569656372, | |
| "rewards/accuracy_reward": 0.8871581554412842, | |
| "rewards/format_reward": 1.0, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.69010925292969, | |
| "epoch": 2.5057471264367814, | |
| "grad_norm": 1.5882030215776515, | |
| "kl": 0.031982421875, | |
| "learning_rate": 7.494252873563218e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8449172973632812, | |
| "reward_std": 0.0257129929959774, | |
| "rewards/accuracy_reward": 0.8449174761772156, | |
| "rewards/format_reward": 1.0, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.49089050292969, | |
| "epoch": 2.5172413793103448, | |
| "grad_norm": 3.140150954079358, | |
| "kl": 0.0322265625, | |
| "learning_rate": 7.482758620689655e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8943170309066772, | |
| "reward_std": 0.023106535896658897, | |
| "rewards/accuracy_reward": 0.8949680328369141, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 99.779296875, | |
| "epoch": 2.528735632183908, | |
| "grad_norm": 1.8149238607460914, | |
| "kl": 0.032958984375, | |
| "learning_rate": 7.471264367816092e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8684821128845215, | |
| "reward_std": 0.023636629804968834, | |
| "rewards/accuracy_reward": 0.8684821128845215, | |
| "rewards/format_reward": 1.0, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.23503112792969, | |
| "epoch": 2.5402298850574714, | |
| "grad_norm": 2.0933238300012715, | |
| "kl": 0.048095703125, | |
| "learning_rate": 7.459770114942528e-07, | |
| "loss": 0.002, | |
| "reward": 1.8713550567626953, | |
| "reward_std": 0.023653965443372726, | |
| "rewards/accuracy_reward": 0.8713551759719849, | |
| "rewards/format_reward": 1.0, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.32292175292969, | |
| "epoch": 2.5517241379310347, | |
| "grad_norm": 2.40931495027469, | |
| "kl": 0.0311279296875, | |
| "learning_rate": 7.448275862068965e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8844565153121948, | |
| "reward_std": 0.025886138901114464, | |
| "rewards/accuracy_reward": 0.8844565153121948, | |
| "rewards/format_reward": 1.0, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.625, | |
| "epoch": 2.5632183908045976, | |
| "grad_norm": 2.3506903297912234, | |
| "kl": 0.035400390625, | |
| "learning_rate": 7.436781609195402e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8940942287445068, | |
| "reward_std": 0.021925464272499084, | |
| "rewards/accuracy_reward": 0.8940942287445068, | |
| "rewards/format_reward": 1.0, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.16796875, | |
| "epoch": 2.574712643678161, | |
| "grad_norm": 4.228763111501542, | |
| "kl": 0.0302734375, | |
| "learning_rate": 7.425287356321839e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8942646980285645, | |
| "reward_std": 0.02186274155974388, | |
| "rewards/accuracy_reward": 0.8942646980285645, | |
| "rewards/format_reward": 1.0, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.87760925292969, | |
| "epoch": 2.586206896551724, | |
| "grad_norm": 1.339092605600081, | |
| "kl": 0.0286865234375, | |
| "learning_rate": 7.413793103448276e-07, | |
| "loss": 0.0012, | |
| "reward": 1.881058692932129, | |
| "reward_std": 0.025702446699142456, | |
| "rewards/accuracy_reward": 0.8817097544670105, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.62435150146484, | |
| "epoch": 2.5977011494252875, | |
| "grad_norm": 1.770916862954278, | |
| "kl": 0.03125, | |
| "learning_rate": 7.402298850574712e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8780031204223633, | |
| "reward_std": 0.02341182343661785, | |
| "rewards/accuracy_reward": 0.8780032396316528, | |
| "rewards/format_reward": 1.0, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.64453125, | |
| "epoch": 2.609195402298851, | |
| "grad_norm": 1.929796218203904, | |
| "kl": 0.0289306640625, | |
| "learning_rate": 7.390804597701149e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8760128021240234, | |
| "reward_std": 0.025408655405044556, | |
| "rewards/accuracy_reward": 0.8760129809379578, | |
| "rewards/format_reward": 1.0, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.86393737792969, | |
| "epoch": 2.6206896551724137, | |
| "grad_norm": 1.2206148979521507, | |
| "kl": 0.0294189453125, | |
| "learning_rate": 7.379310344827586e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8749685287475586, | |
| "reward_std": 0.02704436704516411, | |
| "rewards/accuracy_reward": 0.8749687075614929, | |
| "rewards/format_reward": 1.0, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.52474212646484, | |
| "epoch": 2.632183908045977, | |
| "grad_norm": 1.1892012690175122, | |
| "kl": 0.03076171875, | |
| "learning_rate": 7.367816091954022e-07, | |
| "loss": 0.0013, | |
| "reward": 1.885100245475769, | |
| "reward_std": 0.02430718205869198, | |
| "rewards/accuracy_reward": 0.8851003646850586, | |
| "rewards/format_reward": 1.0, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.72786712646484, | |
| "epoch": 2.6436781609195403, | |
| "grad_norm": 1.474891820836295, | |
| "kl": 0.03076171875, | |
| "learning_rate": 7.35632183908046e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8827028274536133, | |
| "reward_std": 0.026975825428962708, | |
| "rewards/accuracy_reward": 0.8827028274536133, | |
| "rewards/format_reward": 1.0, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.77083587646484, | |
| "epoch": 2.655172413793103, | |
| "grad_norm": 1.7269926903360489, | |
| "kl": 0.0380859375, | |
| "learning_rate": 7.344827586206897e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8721890449523926, | |
| "reward_std": 0.02634214609861374, | |
| "rewards/accuracy_reward": 0.8721892237663269, | |
| "rewards/format_reward": 1.0, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.22526550292969, | |
| "epoch": 2.6666666666666665, | |
| "grad_norm": 1.9439461898517982, | |
| "kl": 0.0322265625, | |
| "learning_rate": 7.333333333333332e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8815977573394775, | |
| "reward_std": 0.027597349137067795, | |
| "rewards/accuracy_reward": 0.8822487592697144, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.474609375, | |
| "epoch": 2.67816091954023, | |
| "grad_norm": 1.4550598569752655, | |
| "kl": 0.03173828125, | |
| "learning_rate": 7.32183908045977e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8949522972106934, | |
| "reward_std": 0.023627810180187225, | |
| "rewards/accuracy_reward": 0.8949524164199829, | |
| "rewards/format_reward": 1.0, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.77214050292969, | |
| "epoch": 2.689655172413793, | |
| "grad_norm": 1.231833875421197, | |
| "kl": 0.033447265625, | |
| "learning_rate": 7.310344827586207e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8845272064208984, | |
| "reward_std": 0.02502075955271721, | |
| "rewards/accuracy_reward": 0.8845272064208984, | |
| "rewards/format_reward": 1.0, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.39974212646484, | |
| "epoch": 2.7011494252873565, | |
| "grad_norm": 1.3652986563130975, | |
| "kl": 0.035400390625, | |
| "learning_rate": 7.298850574712644e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8603239059448242, | |
| "reward_std": 0.0263795405626297, | |
| "rewards/accuracy_reward": 0.8603239059448242, | |
| "rewards/format_reward": 1.0, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.92057800292969, | |
| "epoch": 2.7126436781609193, | |
| "grad_norm": 1.6201244118842026, | |
| "kl": 0.038818359375, | |
| "learning_rate": 7.28735632183908e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8918843269348145, | |
| "reward_std": 0.0242290198802948, | |
| "rewards/accuracy_reward": 0.891884446144104, | |
| "rewards/format_reward": 1.0, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.63346862792969, | |
| "epoch": 2.7241379310344827, | |
| "grad_norm": 1.6072202456697025, | |
| "kl": 0.037841796875, | |
| "learning_rate": 7.275862068965517e-07, | |
| "loss": 0.0016, | |
| "reward": 1.891068458557129, | |
| "reward_std": 0.025224387645721436, | |
| "rewards/accuracy_reward": 0.8917193412780762, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.85807800292969, | |
| "epoch": 2.735632183908046, | |
| "grad_norm": 3.1474234625781308, | |
| "kl": 0.0390625, | |
| "learning_rate": 7.264367816091954e-07, | |
| "loss": 0.0016, | |
| "reward": 1.886623740196228, | |
| "reward_std": 0.024830807000398636, | |
| "rewards/accuracy_reward": 0.8866235613822937, | |
| "rewards/format_reward": 1.0, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.75521087646484, | |
| "epoch": 2.7471264367816093, | |
| "grad_norm": 1.6342951016344707, | |
| "kl": 0.0419921875, | |
| "learning_rate": 7.25287356321839e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8781700134277344, | |
| "reward_std": 0.02809235453605652, | |
| "rewards/accuracy_reward": 0.8794721364974976, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.28255462646484, | |
| "epoch": 2.7586206896551726, | |
| "grad_norm": 1.1515713105906038, | |
| "kl": 0.036865234375, | |
| "learning_rate": 7.241379310344827e-07, | |
| "loss": 0.0015, | |
| "reward": 1.89352548122406, | |
| "reward_std": 0.021531865000724792, | |
| "rewards/accuracy_reward": 0.8935256004333496, | |
| "rewards/format_reward": 1.0, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.35482025146484, | |
| "epoch": 2.7701149425287355, | |
| "grad_norm": 2.045990886967949, | |
| "kl": 0.045166015625, | |
| "learning_rate": 7.229885057471265e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8543777465820312, | |
| "reward_std": 0.028905829414725304, | |
| "rewards/accuracy_reward": 0.8556797504425049, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.45573425292969, | |
| "epoch": 2.781609195402299, | |
| "grad_norm": 2.388872547648165, | |
| "kl": 0.03955078125, | |
| "learning_rate": 7.2183908045977e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8795366287231445, | |
| "reward_std": 0.025653596967458725, | |
| "rewards/accuracy_reward": 0.8801875114440918, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.02799987792969, | |
| "epoch": 2.793103448275862, | |
| "grad_norm": 1.313601894213018, | |
| "kl": 0.041015625, | |
| "learning_rate": 7.206896551724138e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9008476734161377, | |
| "reward_std": 0.02234504744410515, | |
| "rewards/accuracy_reward": 0.9008476734161377, | |
| "rewards/format_reward": 1.0, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.38802337646484, | |
| "epoch": 2.8045977011494254, | |
| "grad_norm": 1.690388570174564, | |
| "kl": 0.039794921875, | |
| "learning_rate": 7.195402298850575e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8841087818145752, | |
| "reward_std": 0.02542022429406643, | |
| "rewards/accuracy_reward": 0.8854107856750488, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.02409362792969, | |
| "epoch": 2.8160919540229887, | |
| "grad_norm": 1.4752619364083788, | |
| "kl": 0.038330078125, | |
| "learning_rate": 7.18390804597701e-07, | |
| "loss": 0.0016, | |
| "reward": 1.872473120689392, | |
| "reward_std": 0.026318645104765892, | |
| "rewards/accuracy_reward": 0.8724731206893921, | |
| "rewards/format_reward": 1.0, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.24674987792969, | |
| "epoch": 2.8275862068965516, | |
| "grad_norm": 13.670541177628872, | |
| "kl": 0.036376953125, | |
| "learning_rate": 7.172413793103448e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8723421096801758, | |
| "reward_std": 0.025184884667396545, | |
| "rewards/accuracy_reward": 0.8723421096801758, | |
| "rewards/format_reward": 1.0, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.20638275146484, | |
| "epoch": 2.839080459770115, | |
| "grad_norm": 1.0940987098177108, | |
| "kl": 0.03515625, | |
| "learning_rate": 7.160919540229885e-07, | |
| "loss": 0.0015, | |
| "reward": 1.875187873840332, | |
| "reward_std": 0.02431398257613182, | |
| "rewards/accuracy_reward": 0.8751880526542664, | |
| "rewards/format_reward": 1.0, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.302734375, | |
| "epoch": 2.8505747126436782, | |
| "grad_norm": 2.079946913833384, | |
| "kl": 0.052490234375, | |
| "learning_rate": 7.149425287356321e-07, | |
| "loss": 0.0022, | |
| "reward": 1.887062430381775, | |
| "reward_std": 0.024545643478631973, | |
| "rewards/accuracy_reward": 0.8870624303817749, | |
| "rewards/format_reward": 1.0, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.73503112792969, | |
| "epoch": 2.862068965517241, | |
| "grad_norm": 1.9116339472111843, | |
| "kl": 0.033935546875, | |
| "learning_rate": 7.137931034482758e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8704020977020264, | |
| "reward_std": 0.02490667998790741, | |
| "rewards/accuracy_reward": 0.8710530996322632, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.78255462646484, | |
| "epoch": 2.873563218390805, | |
| "grad_norm": 1.198578783738147, | |
| "kl": 0.035400390625, | |
| "learning_rate": 7.126436781609195e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8799991607666016, | |
| "reward_std": 0.022041937336325645, | |
| "rewards/accuracy_reward": 0.8799993395805359, | |
| "rewards/format_reward": 1.0, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.40755462646484, | |
| "epoch": 2.8850574712643677, | |
| "grad_norm": 1.8122950855537878, | |
| "kl": 0.037353515625, | |
| "learning_rate": 7.114942528735633e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8816900253295898, | |
| "reward_std": 0.02312072180211544, | |
| "rewards/accuracy_reward": 0.8816901445388794, | |
| "rewards/format_reward": 1.0, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.2578125, | |
| "epoch": 2.896551724137931, | |
| "grad_norm": 1.8584979960733188, | |
| "kl": 0.03369140625, | |
| "learning_rate": 7.103448275862068e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8846673965454102, | |
| "reward_std": 0.025826361030340195, | |
| "rewards/accuracy_reward": 0.8846673965454102, | |
| "rewards/format_reward": 1.0, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.16796875, | |
| "epoch": 2.9080459770114944, | |
| "grad_norm": 3.6080553707797423, | |
| "kl": 0.033447265625, | |
| "learning_rate": 7.091954022988506e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8778409957885742, | |
| "reward_std": 0.02433183044195175, | |
| "rewards/accuracy_reward": 0.8778411746025085, | |
| "rewards/format_reward": 1.0, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.78776550292969, | |
| "epoch": 2.9195402298850572, | |
| "grad_norm": 1.4145733069699882, | |
| "kl": 0.037109375, | |
| "learning_rate": 7.080459770114943e-07, | |
| "loss": 0.0015, | |
| "reward": 1.881535291671753, | |
| "reward_std": 0.023436537012457848, | |
| "rewards/accuracy_reward": 0.8815354108810425, | |
| "rewards/format_reward": 1.0, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.01237487792969, | |
| "epoch": 2.9310344827586206, | |
| "grad_norm": 1.3798889630920503, | |
| "kl": 0.037109375, | |
| "learning_rate": 7.068965517241378e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8669624328613281, | |
| "reward_std": 0.024586662650108337, | |
| "rewards/accuracy_reward": 0.8669624328613281, | |
| "rewards/format_reward": 1.0, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.84049987792969, | |
| "epoch": 2.942528735632184, | |
| "grad_norm": 1.501972491549555, | |
| "kl": 0.033935546875, | |
| "learning_rate": 7.057471264367816e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8706506490707397, | |
| "reward_std": 0.02558681182563305, | |
| "rewards/accuracy_reward": 0.8706506490707397, | |
| "rewards/format_reward": 1.0, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.26692962646484, | |
| "epoch": 2.954022988505747, | |
| "grad_norm": 1.4790939200318616, | |
| "kl": 0.037841796875, | |
| "learning_rate": 7.045977011494253e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8889020681381226, | |
| "reward_std": 0.02365148440003395, | |
| "rewards/accuracy_reward": 0.888901948928833, | |
| "rewards/format_reward": 1.0, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 99.91146087646484, | |
| "epoch": 2.9655172413793105, | |
| "grad_norm": 3.7173905554951854, | |
| "kl": 0.0380859375, | |
| "learning_rate": 7.034482758620688e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8991615772247314, | |
| "reward_std": 0.01971290074288845, | |
| "rewards/accuracy_reward": 0.8991615772247314, | |
| "rewards/format_reward": 1.0, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 100.97917175292969, | |
| "epoch": 2.9770114942528734, | |
| "grad_norm": 1.2829606539449347, | |
| "kl": 0.03564453125, | |
| "learning_rate": 7.022988505747126e-07, | |
| "loss": 0.0015, | |
| "reward": 1.88856840133667, | |
| "reward_std": 0.02619217149913311, | |
| "rewards/accuracy_reward": 0.8898705840110779, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 99.66471862792969, | |
| "epoch": 2.9885057471264367, | |
| "grad_norm": 1.470317712158937, | |
| "kl": 0.03759765625, | |
| "learning_rate": 7.011494252873563e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8722280263900757, | |
| "reward_std": 0.022509008646011353, | |
| "rewards/accuracy_reward": 0.8722281455993652, | |
| "rewards/format_reward": 1.0, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 96.02809143066406, | |
| "epoch": 3.0, | |
| "grad_norm": 3.0919770267204147, | |
| "kl": 0.03466796875, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8606176376342773, | |
| "reward_std": 0.022723043337464333, | |
| "rewards/accuracy_reward": 0.8606176376342773, | |
| "rewards/format_reward": 1.0, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.25260925292969, | |
| "epoch": 3.0114942528735633, | |
| "grad_norm": 2.7419615747949386, | |
| "kl": 0.034912109375, | |
| "learning_rate": 6.988505747126436e-07, | |
| "loss": 0.0014, | |
| "reward": 1.869284987449646, | |
| "reward_std": 0.024234648793935776, | |
| "rewards/accuracy_reward": 0.8692850470542908, | |
| "rewards/format_reward": 1.0, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.68359375, | |
| "epoch": 3.0229885057471266, | |
| "grad_norm": 1.559372722176547, | |
| "kl": 0.03271484375, | |
| "learning_rate": 6.977011494252873e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8653929233551025, | |
| "reward_std": 0.02791503071784973, | |
| "rewards/accuracy_reward": 0.8660439252853394, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.7109375, | |
| "epoch": 3.0344827586206895, | |
| "grad_norm": 2.1221245906508464, | |
| "kl": 0.033203125, | |
| "learning_rate": 6.96551724137931e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8776719570159912, | |
| "reward_std": 0.023673653602600098, | |
| "rewards/accuracy_reward": 0.8776720762252808, | |
| "rewards/format_reward": 1.0, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.60872650146484, | |
| "epoch": 3.045977011494253, | |
| "grad_norm": 1.6002601391217546, | |
| "kl": 0.03662109375, | |
| "learning_rate": 6.954022988505746e-07, | |
| "loss": 0.0015, | |
| "reward": 1.830482840538025, | |
| "reward_std": 0.02900915965437889, | |
| "rewards/accuracy_reward": 0.8304829597473145, | |
| "rewards/format_reward": 1.0, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.30794525146484, | |
| "epoch": 3.057471264367816, | |
| "grad_norm": 1.6434905190427025, | |
| "kl": 0.035888671875, | |
| "learning_rate": 6.942528735632184e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8308700323104858, | |
| "reward_std": 0.02639869973063469, | |
| "rewards/accuracy_reward": 0.8315210342407227, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.29167175292969, | |
| "epoch": 3.0689655172413794, | |
| "grad_norm": 1.027886396009367, | |
| "kl": 0.03466796875, | |
| "learning_rate": 6.931034482758621e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8851213455200195, | |
| "reward_std": 0.021387819200754166, | |
| "rewards/accuracy_reward": 0.8851213455200195, | |
| "rewards/format_reward": 1.0, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.84700775146484, | |
| "epoch": 3.0804597701149423, | |
| "grad_norm": 1.5206328556554167, | |
| "kl": 0.038818359375, | |
| "learning_rate": 6.919540229885057e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8900096416473389, | |
| "reward_std": 0.02050493285059929, | |
| "rewards/accuracy_reward": 0.8900095820426941, | |
| "rewards/format_reward": 1.0, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.73177337646484, | |
| "epoch": 3.0919540229885056, | |
| "grad_norm": 1.1641802469606335, | |
| "kl": 0.034423828125, | |
| "learning_rate": 6.908045977011494e-07, | |
| "loss": 0.0015, | |
| "reward": 1.858663558959961, | |
| "reward_std": 0.025883881375193596, | |
| "rewards/accuracy_reward": 0.8593146800994873, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.19140625, | |
| "epoch": 3.103448275862069, | |
| "grad_norm": 1.332286209838367, | |
| "kl": 0.03271484375, | |
| "learning_rate": 6.896551724137931e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8819489479064941, | |
| "reward_std": 0.023023171350359917, | |
| "rewards/accuracy_reward": 0.8819491267204285, | |
| "rewards/format_reward": 1.0, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.453125, | |
| "epoch": 3.1149425287356323, | |
| "grad_norm": 1.3705118964619707, | |
| "kl": 0.0341796875, | |
| "learning_rate": 6.885057471264368e-07, | |
| "loss": 0.0015, | |
| "reward": 1.858891248703003, | |
| "reward_std": 0.022298548370599747, | |
| "rewards/accuracy_reward": 0.8595423698425293, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.52278900146484, | |
| "epoch": 3.1264367816091956, | |
| "grad_norm": 1.5377438314997252, | |
| "kl": 0.041015625, | |
| "learning_rate": 6.873563218390804e-07, | |
| "loss": 0.0017, | |
| "reward": 1.884607195854187, | |
| "reward_std": 0.024785785004496574, | |
| "rewards/accuracy_reward": 0.8852583169937134, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.56380462646484, | |
| "epoch": 3.1379310344827585, | |
| "grad_norm": 1.9722987101278098, | |
| "kl": 0.03515625, | |
| "learning_rate": 6.862068965517241e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8918275833129883, | |
| "reward_std": 0.021715257316827774, | |
| "rewards/accuracy_reward": 0.8918277621269226, | |
| "rewards/format_reward": 1.0, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.19792175292969, | |
| "epoch": 3.1494252873563218, | |
| "grad_norm": 1.5898347428804778, | |
| "kl": 0.03125, | |
| "learning_rate": 6.850574712643678e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8759284019470215, | |
| "reward_std": 0.025104787200689316, | |
| "rewards/accuracy_reward": 0.8765794634819031, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.07747650146484, | |
| "epoch": 3.160919540229885, | |
| "grad_norm": 1.6076437000195318, | |
| "kl": 0.0380859375, | |
| "learning_rate": 6.839080459770114e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8818080425262451, | |
| "reward_std": 0.02497541531920433, | |
| "rewards/accuracy_reward": 0.8824591040611267, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.35546875, | |
| "epoch": 3.1724137931034484, | |
| "grad_norm": 1.7685732057058994, | |
| "kl": 0.039306640625, | |
| "learning_rate": 6.827586206896552e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8782709836959839, | |
| "reward_std": 0.025368181988596916, | |
| "rewards/accuracy_reward": 0.8789221048355103, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.39128112792969, | |
| "epoch": 3.1839080459770113, | |
| "grad_norm": 4.02053655146932, | |
| "kl": 0.04150390625, | |
| "learning_rate": 6.816091954022988e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8910026550292969, | |
| "reward_std": 0.022438380867242813, | |
| "rewards/accuracy_reward": 0.8910026550292969, | |
| "rewards/format_reward": 1.0, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 118.00130462646484, | |
| "epoch": 3.1954022988505746, | |
| "grad_norm": 1.2219545665192806, | |
| "kl": 0.033935546875, | |
| "learning_rate": 6.804597701149425e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8810728788375854, | |
| "reward_std": 0.027784962207078934, | |
| "rewards/accuracy_reward": 0.8830260038375854, | |
| "rewards/format_reward": 0.998046875, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.451171875, | |
| "epoch": 3.206896551724138, | |
| "grad_norm": 1.7436462195514546, | |
| "kl": 0.038818359375, | |
| "learning_rate": 6.793103448275862e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8771369457244873, | |
| "reward_std": 0.023900484666228294, | |
| "rewards/accuracy_reward": 0.8771368861198425, | |
| "rewards/format_reward": 1.0, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.857421875, | |
| "epoch": 3.218390804597701, | |
| "grad_norm": 1.564180567510325, | |
| "kl": 0.0390625, | |
| "learning_rate": 6.781609195402298e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8888477087020874, | |
| "reward_std": 0.02220313809812069, | |
| "rewards/accuracy_reward": 0.8888477683067322, | |
| "rewards/format_reward": 1.0, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.80924987792969, | |
| "epoch": 3.2298850574712645, | |
| "grad_norm": 4.146116713241933, | |
| "kl": 0.040771484375, | |
| "learning_rate": 6.770114942528736e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8889455795288086, | |
| "reward_std": 0.021025802940130234, | |
| "rewards/accuracy_reward": 0.8889455795288086, | |
| "rewards/format_reward": 1.0, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.03450775146484, | |
| "epoch": 3.2413793103448274, | |
| "grad_norm": 2.0093993818021167, | |
| "kl": 0.037841796875, | |
| "learning_rate": 6.758620689655172e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8997611999511719, | |
| "reward_std": 0.022101037204265594, | |
| "rewards/accuracy_reward": 0.8997613191604614, | |
| "rewards/format_reward": 1.0, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.27734375, | |
| "epoch": 3.2528735632183907, | |
| "grad_norm": 1.7252038523721087, | |
| "kl": 0.043212890625, | |
| "learning_rate": 6.747126436781609e-07, | |
| "loss": 0.0018, | |
| "reward": 1.893257975578308, | |
| "reward_std": 0.024223104119300842, | |
| "rewards/accuracy_reward": 0.8945600986480713, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.23177337646484, | |
| "epoch": 3.264367816091954, | |
| "grad_norm": 1.3254878153740217, | |
| "kl": 0.03955078125, | |
| "learning_rate": 6.735632183908046e-07, | |
| "loss": 0.0017, | |
| "reward": 1.897439956665039, | |
| "reward_std": 0.021015014499425888, | |
| "rewards/accuracy_reward": 0.8974398374557495, | |
| "rewards/format_reward": 1.0, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.99609375, | |
| "epoch": 3.2758620689655173, | |
| "grad_norm": 1.5719828379459555, | |
| "kl": 0.03955078125, | |
| "learning_rate": 6.724137931034482e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8719691038131714, | |
| "reward_std": 0.022564705461263657, | |
| "rewards/accuracy_reward": 0.8719691038131714, | |
| "rewards/format_reward": 1.0, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.23372650146484, | |
| "epoch": 3.2873563218390807, | |
| "grad_norm": 1.1121764344089415, | |
| "kl": 0.04052734375, | |
| "learning_rate": 6.71264367816092e-07, | |
| "loss": 0.0017, | |
| "reward": 1.88375985622406, | |
| "reward_std": 0.022753890603780746, | |
| "rewards/accuracy_reward": 0.8837599158287048, | |
| "rewards/format_reward": 1.0, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.83724212646484, | |
| "epoch": 3.2988505747126435, | |
| "grad_norm": 4.799405333071082, | |
| "kl": 0.04296875, | |
| "learning_rate": 6.701149425287356e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8786205053329468, | |
| "reward_std": 0.025325840339064598, | |
| "rewards/accuracy_reward": 0.8799225687980652, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.91536712646484, | |
| "epoch": 3.310344827586207, | |
| "grad_norm": 2.021051884647491, | |
| "kl": 0.04345703125, | |
| "learning_rate": 6.689655172413793e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8967899084091187, | |
| "reward_std": 0.019771821796894073, | |
| "rewards/accuracy_reward": 0.8967899084091187, | |
| "rewards/format_reward": 1.0, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.49544525146484, | |
| "epoch": 3.32183908045977, | |
| "grad_norm": 1.9020287418215316, | |
| "kl": 0.04052734375, | |
| "learning_rate": 6.67816091954023e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8606189489364624, | |
| "reward_std": 0.025568749755620956, | |
| "rewards/accuracy_reward": 0.8619210720062256, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.46745300292969, | |
| "epoch": 3.3333333333333335, | |
| "grad_norm": 1.2275084362530455, | |
| "kl": 0.041015625, | |
| "learning_rate": 6.666666666666666e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8807505369186401, | |
| "reward_std": 0.02470255456864834, | |
| "rewards/accuracy_reward": 0.8807506561279297, | |
| "rewards/format_reward": 1.0, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.484375, | |
| "epoch": 3.344827586206897, | |
| "grad_norm": 1.5137610627678855, | |
| "kl": 0.036376953125, | |
| "learning_rate": 6.655172413793103e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8718066215515137, | |
| "reward_std": 0.025618024170398712, | |
| "rewards/accuracy_reward": 0.8718067407608032, | |
| "rewards/format_reward": 1.0, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.90234375, | |
| "epoch": 3.3563218390804597, | |
| "grad_norm": 3.2723373427710034, | |
| "kl": 0.037109375, | |
| "learning_rate": 6.64367816091954e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8626410961151123, | |
| "reward_std": 0.02414146065711975, | |
| "rewards/accuracy_reward": 0.8632920980453491, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.763671875, | |
| "epoch": 3.367816091954023, | |
| "grad_norm": 1.4200196313502904, | |
| "kl": 0.042724609375, | |
| "learning_rate": 6.632183908045976e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8814753293991089, | |
| "reward_std": 0.02261793240904808, | |
| "rewards/accuracy_reward": 0.8814753293991089, | |
| "rewards/format_reward": 1.0, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.11849212646484, | |
| "epoch": 3.3793103448275863, | |
| "grad_norm": 2.2801476286690554, | |
| "kl": 0.038330078125, | |
| "learning_rate": 6.620689655172414e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8842586278915405, | |
| "reward_std": 0.026064470410346985, | |
| "rewards/accuracy_reward": 0.8842586874961853, | |
| "rewards/format_reward": 1.0, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.70052337646484, | |
| "epoch": 3.3908045977011496, | |
| "grad_norm": 1.1324912574969757, | |
| "kl": 0.0390625, | |
| "learning_rate": 6.609195402298851e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8832792043685913, | |
| "reward_std": 0.02494307979941368, | |
| "rewards/accuracy_reward": 0.8832792043685913, | |
| "rewards/format_reward": 1.0, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.57682800292969, | |
| "epoch": 3.4022988505747125, | |
| "grad_norm": 1.8918486699769739, | |
| "kl": 0.037841796875, | |
| "learning_rate": 6.597701149425286e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8709214925765991, | |
| "reward_std": 0.02543533965945244, | |
| "rewards/accuracy_reward": 0.8709214925765991, | |
| "rewards/format_reward": 1.0, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.50911712646484, | |
| "epoch": 3.413793103448276, | |
| "grad_norm": 3.3366246111039506, | |
| "kl": 0.036865234375, | |
| "learning_rate": 6.586206896551724e-07, | |
| "loss": 0.0015, | |
| "reward": 1.9022067785263062, | |
| "reward_std": 0.02121621184051037, | |
| "rewards/accuracy_reward": 0.9022065997123718, | |
| "rewards/format_reward": 1.0, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.03060150146484, | |
| "epoch": 3.425287356321839, | |
| "grad_norm": 1.784585488176211, | |
| "kl": 0.0380859375, | |
| "learning_rate": 6.574712643678161e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8803653717041016, | |
| "reward_std": 0.02356140874326229, | |
| "rewards/accuracy_reward": 0.8810164332389832, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.619140625, | |
| "epoch": 3.4367816091954024, | |
| "grad_norm": 1.799334133513421, | |
| "kl": 0.039306640625, | |
| "learning_rate": 6.563218390804598e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8940705060958862, | |
| "reward_std": 0.023154854774475098, | |
| "rewards/accuracy_reward": 0.8940703272819519, | |
| "rewards/format_reward": 1.0, | |
| "step": 299 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.240234375, | |
| "epoch": 3.4482758620689653, | |
| "grad_norm": 1.5600328539462516, | |
| "kl": 0.037841796875, | |
| "learning_rate": 6.551724137931034e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8870646953582764, | |
| "reward_std": 0.021850477904081345, | |
| "rewards/accuracy_reward": 0.8870646357536316, | |
| "rewards/format_reward": 1.0, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.82682800292969, | |
| "epoch": 3.4597701149425286, | |
| "grad_norm": 1.3183868361300648, | |
| "kl": 0.041259765625, | |
| "learning_rate": 6.540229885057471e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9015138149261475, | |
| "reward_std": 0.02374776266515255, | |
| "rewards/accuracy_reward": 0.9015137553215027, | |
| "rewards/format_reward": 1.0, | |
| "step": 301 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.62109375, | |
| "epoch": 3.471264367816092, | |
| "grad_norm": 1.7939569716400035, | |
| "kl": 0.04052734375, | |
| "learning_rate": 6.528735632183908e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8765311241149902, | |
| "reward_std": 0.023926347494125366, | |
| "rewards/accuracy_reward": 0.8765311241149902, | |
| "rewards/format_reward": 1.0, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.55143737792969, | |
| "epoch": 3.4827586206896552, | |
| "grad_norm": 1.4131833430934648, | |
| "kl": 0.04150390625, | |
| "learning_rate": 6.517241379310344e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8952958583831787, | |
| "reward_std": 0.021189194172620773, | |
| "rewards/accuracy_reward": 0.8952958583831787, | |
| "rewards/format_reward": 1.0, | |
| "step": 303 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.07421875, | |
| "epoch": 3.4942528735632186, | |
| "grad_norm": 2.76037518653071, | |
| "kl": 0.039794921875, | |
| "learning_rate": 6.505747126436782e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8874156475067139, | |
| "reward_std": 0.024307802319526672, | |
| "rewards/accuracy_reward": 0.8874154090881348, | |
| "rewards/format_reward": 1.0, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.955078125, | |
| "epoch": 3.5057471264367814, | |
| "grad_norm": 2.1138139550456376, | |
| "kl": 0.04150390625, | |
| "learning_rate": 6.494252873563219e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8822648525238037, | |
| "reward_std": 0.023932967334985733, | |
| "rewards/accuracy_reward": 0.8822647929191589, | |
| "rewards/format_reward": 1.0, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.31575775146484, | |
| "epoch": 3.5172413793103448, | |
| "grad_norm": 1.2027549538193985, | |
| "kl": 0.044921875, | |
| "learning_rate": 6.482758620689654e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8690357208251953, | |
| "reward_std": 0.02493324503302574, | |
| "rewards/accuracy_reward": 0.8696866035461426, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.220703125, | |
| "epoch": 3.528735632183908, | |
| "grad_norm": 3.667217381252123, | |
| "kl": 0.042236328125, | |
| "learning_rate": 6.471264367816092e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9012823104858398, | |
| "reward_std": 0.021280398592352867, | |
| "rewards/accuracy_reward": 0.9012823104858398, | |
| "rewards/format_reward": 1.0, | |
| "step": 307 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.03646087646484, | |
| "epoch": 3.5402298850574714, | |
| "grad_norm": 1.3771503535932048, | |
| "kl": 0.044189453125, | |
| "learning_rate": 6.459770114942529e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8968042135238647, | |
| "reward_std": 0.0209085401147604, | |
| "rewards/accuracy_reward": 0.8968044519424438, | |
| "rewards/format_reward": 1.0, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.03841400146484, | |
| "epoch": 3.5517241379310347, | |
| "grad_norm": 2.601418631517434, | |
| "kl": 0.041748046875, | |
| "learning_rate": 6.448275862068964e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8753538131713867, | |
| "reward_std": 0.024225711822509766, | |
| "rewards/accuracy_reward": 0.8760050535202026, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 309 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.80403900146484, | |
| "epoch": 3.5632183908045976, | |
| "grad_norm": 1.5004741067737122, | |
| "kl": 0.0419921875, | |
| "learning_rate": 6.436781609195402e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8968862295150757, | |
| "reward_std": 0.019103027880191803, | |
| "rewards/accuracy_reward": 0.8968861103057861, | |
| "rewards/format_reward": 1.0, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.68685150146484, | |
| "epoch": 3.574712643678161, | |
| "grad_norm": 1.788362217489612, | |
| "kl": 0.040771484375, | |
| "learning_rate": 6.425287356321839e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8962510824203491, | |
| "reward_std": 0.02111215889453888, | |
| "rewards/accuracy_reward": 0.8962510824203491, | |
| "rewards/format_reward": 1.0, | |
| "step": 311 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.00260925292969, | |
| "epoch": 3.586206896551724, | |
| "grad_norm": 1.789359418846744, | |
| "kl": 0.042724609375, | |
| "learning_rate": 6.413793103448275e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8833904266357422, | |
| "reward_std": 0.019987134262919426, | |
| "rewards/accuracy_reward": 0.8833904266357422, | |
| "rewards/format_reward": 1.0, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.673828125, | |
| "epoch": 3.5977011494252875, | |
| "grad_norm": 4.249206728495501, | |
| "kl": 0.03857421875, | |
| "learning_rate": 6.402298850574712e-07, | |
| "loss": 0.0016, | |
| "reward": 1.891036868095398, | |
| "reward_std": 0.020312292501330376, | |
| "rewards/accuracy_reward": 0.8916880488395691, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 313 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.59375, | |
| "epoch": 3.609195402298851, | |
| "grad_norm": 4.978124356714563, | |
| "kl": 0.03955078125, | |
| "learning_rate": 6.390804597701149e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8778576850891113, | |
| "reward_std": 0.021446645259857178, | |
| "rewards/accuracy_reward": 0.8785087466239929, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.49349212646484, | |
| "epoch": 3.6206896551724137, | |
| "grad_norm": 1.5867170646844477, | |
| "kl": 0.037353515625, | |
| "learning_rate": 6.379310344827587e-07, | |
| "loss": 0.0016, | |
| "reward": 1.898721694946289, | |
| "reward_std": 0.020030900835990906, | |
| "rewards/accuracy_reward": 0.8987216949462891, | |
| "rewards/format_reward": 1.0, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.83659362792969, | |
| "epoch": 3.632183908045977, | |
| "grad_norm": 1.873279997651989, | |
| "kl": 0.041259765625, | |
| "learning_rate": 6.367816091954022e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8622578382492065, | |
| "reward_std": 0.022315729409456253, | |
| "rewards/accuracy_reward": 0.8629088401794434, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.33464050292969, | |
| "epoch": 3.6436781609195403, | |
| "grad_norm": 3.2650242705040964, | |
| "kl": 0.041748046875, | |
| "learning_rate": 6.35632183908046e-07, | |
| "loss": 0.0017, | |
| "reward": 1.886378288269043, | |
| "reward_std": 0.019871540367603302, | |
| "rewards/accuracy_reward": 0.8863782286643982, | |
| "rewards/format_reward": 1.0, | |
| "step": 317 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.236328125, | |
| "epoch": 3.655172413793103, | |
| "grad_norm": 2.4548899206996175, | |
| "kl": 0.038818359375, | |
| "learning_rate": 6.344827586206897e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8966128826141357, | |
| "reward_std": 0.023452280089259148, | |
| "rewards/accuracy_reward": 0.896612823009491, | |
| "rewards/format_reward": 1.0, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.52409362792969, | |
| "epoch": 3.6666666666666665, | |
| "grad_norm": 1.753082960789312, | |
| "kl": 0.042236328125, | |
| "learning_rate": 6.333333333333332e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8872454166412354, | |
| "reward_std": 0.02262219414114952, | |
| "rewards/accuracy_reward": 0.8878964185714722, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 319 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.630859375, | |
| "epoch": 3.67816091954023, | |
| "grad_norm": 2.176188734436804, | |
| "kl": 0.038818359375, | |
| "learning_rate": 6.32183908045977e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8845866918563843, | |
| "reward_std": 0.02126806601881981, | |
| "rewards/accuracy_reward": 0.8845868110656738, | |
| "rewards/format_reward": 1.0, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.69140625, | |
| "epoch": 3.689655172413793, | |
| "grad_norm": 1.5506807446234716, | |
| "kl": 0.0390625, | |
| "learning_rate": 6.310344827586207e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8919544219970703, | |
| "reward_std": 0.02176390215754509, | |
| "rewards/accuracy_reward": 0.8926056623458862, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 321 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.40234375, | |
| "epoch": 3.7011494252873565, | |
| "grad_norm": 7.014431541777074, | |
| "kl": 0.03662109375, | |
| "learning_rate": 6.298850574712643e-07, | |
| "loss": 0.0015, | |
| "reward": 1.895882487297058, | |
| "reward_std": 0.020401567220687866, | |
| "rewards/accuracy_reward": 0.8958825469017029, | |
| "rewards/format_reward": 1.0, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.291015625, | |
| "epoch": 3.7126436781609193, | |
| "grad_norm": 2.3982992480163206, | |
| "kl": 0.041259765625, | |
| "learning_rate": 6.28735632183908e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8751071691513062, | |
| "reward_std": 0.02190619520843029, | |
| "rewards/accuracy_reward": 0.8751071691513062, | |
| "rewards/format_reward": 1.0, | |
| "step": 323 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.45638275146484, | |
| "epoch": 3.7241379310344827, | |
| "grad_norm": 1.903714474914862, | |
| "kl": 0.043212890625, | |
| "learning_rate": 6.275862068965517e-07, | |
| "loss": 0.0018, | |
| "reward": 1.883380651473999, | |
| "reward_std": 0.02156060002744198, | |
| "rewards/accuracy_reward": 0.8840314745903015, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.33724212646484, | |
| "epoch": 3.735632183908046, | |
| "grad_norm": 3.392518970641651, | |
| "kl": 0.03955078125, | |
| "learning_rate": 6.264367816091954e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8731813430786133, | |
| "reward_std": 0.022301897406578064, | |
| "rewards/accuracy_reward": 0.8738325834274292, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.10221862792969, | |
| "epoch": 3.7471264367816093, | |
| "grad_norm": 2.4944444297866886, | |
| "kl": 0.0400390625, | |
| "learning_rate": 6.25287356321839e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8849986791610718, | |
| "reward_std": 0.025180388242006302, | |
| "rewards/accuracy_reward": 0.8863007426261902, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.51692962646484, | |
| "epoch": 3.7586206896551726, | |
| "grad_norm": 1.5524974855638687, | |
| "kl": 0.048095703125, | |
| "learning_rate": 6.241379310344828e-07, | |
| "loss": 0.002, | |
| "reward": 1.8775360584259033, | |
| "reward_std": 0.020194731652736664, | |
| "rewards/accuracy_reward": 0.8781870603561401, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 327 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.32487487792969, | |
| "epoch": 3.7701149425287355, | |
| "grad_norm": 1.4924502520471505, | |
| "kl": 0.044189453125, | |
| "learning_rate": 6.229885057471264e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8867871761322021, | |
| "reward_std": 0.02044479362666607, | |
| "rewards/accuracy_reward": 0.8874382972717285, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.1875, | |
| "epoch": 3.781609195402299, | |
| "grad_norm": 2.570142758125839, | |
| "kl": 0.04248046875, | |
| "learning_rate": 6.2183908045977e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8741064071655273, | |
| "reward_std": 0.019452206790447235, | |
| "rewards/accuracy_reward": 0.8741063475608826, | |
| "rewards/format_reward": 1.0, | |
| "step": 329 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.177734375, | |
| "epoch": 3.793103448275862, | |
| "grad_norm": 1.638699330431456, | |
| "kl": 0.048583984375, | |
| "learning_rate": 6.206896551724138e-07, | |
| "loss": 0.002, | |
| "reward": 1.883441686630249, | |
| "reward_std": 0.021484442055225372, | |
| "rewards/accuracy_reward": 0.8847436904907227, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.916015625, | |
| "epoch": 3.8045977011494254, | |
| "grad_norm": 1.4709865788564034, | |
| "kl": 0.041015625, | |
| "learning_rate": 6.195402298850575e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8892269134521484, | |
| "reward_std": 0.019907724112272263, | |
| "rewards/accuracy_reward": 0.8892270922660828, | |
| "rewards/format_reward": 1.0, | |
| "step": 331 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.96224212646484, | |
| "epoch": 3.8160919540229887, | |
| "grad_norm": 3.682456578081511, | |
| "kl": 0.044189453125, | |
| "learning_rate": 6.183908045977011e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8904244899749756, | |
| "reward_std": 0.017542677000164986, | |
| "rewards/accuracy_reward": 0.8904244303703308, | |
| "rewards/format_reward": 1.0, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.03385925292969, | |
| "epoch": 3.8275862068965516, | |
| "grad_norm": 1.4011409988409274, | |
| "kl": 0.042724609375, | |
| "learning_rate": 6.172413793103448e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8981231451034546, | |
| "reward_std": 0.019530044868588448, | |
| "rewards/accuracy_reward": 0.8981232047080994, | |
| "rewards/format_reward": 1.0, | |
| "step": 333 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.95052337646484, | |
| "epoch": 3.839080459770115, | |
| "grad_norm": 2.57360289310368, | |
| "kl": 0.04296875, | |
| "learning_rate": 6.160919540229885e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8822907209396362, | |
| "reward_std": 0.020302042365074158, | |
| "rewards/accuracy_reward": 0.882290780544281, | |
| "rewards/format_reward": 1.0, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.95964050292969, | |
| "epoch": 3.8505747126436782, | |
| "grad_norm": 3.9665298261234136, | |
| "kl": 0.04833984375, | |
| "learning_rate": 6.149425287356322e-07, | |
| "loss": 0.002, | |
| "reward": 1.8941223621368408, | |
| "reward_std": 0.02138252556324005, | |
| "rewards/accuracy_reward": 0.8947733640670776, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.89974212646484, | |
| "epoch": 3.862068965517241, | |
| "grad_norm": 2.706717328267551, | |
| "kl": 0.046875, | |
| "learning_rate": 6.137931034482758e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8608148097991943, | |
| "reward_std": 0.02197238616645336, | |
| "rewards/accuracy_reward": 0.8608149886131287, | |
| "rewards/format_reward": 1.0, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.17839050292969, | |
| "epoch": 3.873563218390805, | |
| "grad_norm": 2.9331158810141704, | |
| "kl": 0.043212890625, | |
| "learning_rate": 6.126436781609195e-07, | |
| "loss": 0.0018, | |
| "reward": 1.877394676208496, | |
| "reward_std": 0.022260598838329315, | |
| "rewards/accuracy_reward": 0.8773946762084961, | |
| "rewards/format_reward": 1.0, | |
| "step": 337 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.91732025146484, | |
| "epoch": 3.8850574712643677, | |
| "grad_norm": 1.7505237848600723, | |
| "kl": 0.046875, | |
| "learning_rate": 6.114942528735632e-07, | |
| "loss": 0.002, | |
| "reward": 1.8993761539459229, | |
| "reward_std": 0.01834731549024582, | |
| "rewards/accuracy_reward": 0.8993761539459229, | |
| "rewards/format_reward": 1.0, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.50456237792969, | |
| "epoch": 3.896551724137931, | |
| "grad_norm": 1.8091603678916988, | |
| "kl": 0.04541015625, | |
| "learning_rate": 6.103448275862068e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8772138357162476, | |
| "reward_std": 0.02404957078397274, | |
| "rewards/accuracy_reward": 0.8778649568557739, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 339 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.89453125, | |
| "epoch": 3.9080459770114944, | |
| "grad_norm": 2.7420144960936064, | |
| "kl": 0.0439453125, | |
| "learning_rate": 6.091954022988506e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8750686645507812, | |
| "reward_std": 0.02069086581468582, | |
| "rewards/accuracy_reward": 0.8750687837600708, | |
| "rewards/format_reward": 1.0, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.76692962646484, | |
| "epoch": 3.9195402298850572, | |
| "grad_norm": 1.3320968254982721, | |
| "kl": 0.042724609375, | |
| "learning_rate": 6.080459770114942e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8823789358139038, | |
| "reward_std": 0.020603572949767113, | |
| "rewards/accuracy_reward": 0.8823789358139038, | |
| "rewards/format_reward": 1.0, | |
| "step": 341 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.79232025146484, | |
| "epoch": 3.9310344827586206, | |
| "grad_norm": 2.6292451489584363, | |
| "kl": 0.042236328125, | |
| "learning_rate": 6.068965517241379e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8959369659423828, | |
| "reward_std": 0.02122236043214798, | |
| "rewards/accuracy_reward": 0.8965880274772644, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.26237487792969, | |
| "epoch": 3.942528735632184, | |
| "grad_norm": 1.1786665172082536, | |
| "kl": 0.041748046875, | |
| "learning_rate": 6.057471264367816e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8724334239959717, | |
| "reward_std": 0.02036610245704651, | |
| "rewards/accuracy_reward": 0.8724333643913269, | |
| "rewards/format_reward": 1.0, | |
| "step": 343 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.0703125, | |
| "epoch": 3.954022988505747, | |
| "grad_norm": 1.8258343549622353, | |
| "kl": 0.041015625, | |
| "learning_rate": 6.045977011494252e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8895347118377686, | |
| "reward_std": 0.021330809220671654, | |
| "rewards/accuracy_reward": 0.8901857137680054, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.08984375, | |
| "epoch": 3.9655172413793105, | |
| "grad_norm": 1.2316728903522134, | |
| "kl": 0.04150390625, | |
| "learning_rate": 6.03448275862069e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8853051662445068, | |
| "reward_std": 0.020341580733656883, | |
| "rewards/accuracy_reward": 0.8853051662445068, | |
| "rewards/format_reward": 1.0, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.59765625, | |
| "epoch": 3.9770114942528734, | |
| "grad_norm": 3.7347087879528753, | |
| "kl": 0.0498046875, | |
| "learning_rate": 6.022988505747126e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8948701620101929, | |
| "reward_std": 0.020036697387695312, | |
| "rewards/accuracy_reward": 0.8948701620101929, | |
| "rewards/format_reward": 1.0, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.96745300292969, | |
| "epoch": 3.9885057471264367, | |
| "grad_norm": 1.374220142289457, | |
| "kl": 0.047119140625, | |
| "learning_rate": 6.011494252873563e-07, | |
| "loss": 0.002, | |
| "reward": 1.878260612487793, | |
| "reward_std": 0.020665448158979416, | |
| "rewards/accuracy_reward": 0.8782605528831482, | |
| "rewards/format_reward": 1.0, | |
| "step": 347 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 99.40168762207031, | |
| "epoch": 4.0, | |
| "grad_norm": 1.3796700468562093, | |
| "kl": 0.0361328125, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8900009393692017, | |
| "reward_std": 0.0169126745313406, | |
| "rewards/accuracy_reward": 0.890001118183136, | |
| "rewards/format_reward": 1.0, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.21484375, | |
| "epoch": 4.011494252873563, | |
| "grad_norm": 1.4078584124283187, | |
| "kl": 0.036376953125, | |
| "learning_rate": 5.988505747126437e-07, | |
| "loss": 0.0015, | |
| "reward": 1.861340880393982, | |
| "reward_std": 0.024126818403601646, | |
| "rewards/accuracy_reward": 0.8613407611846924, | |
| "rewards/format_reward": 1.0, | |
| "step": 349 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.14388275146484, | |
| "epoch": 4.022988505747127, | |
| "grad_norm": 1.9884071163588437, | |
| "kl": 0.044189453125, | |
| "learning_rate": 5.977011494252874e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8884938955307007, | |
| "reward_std": 0.02110714465379715, | |
| "rewards/accuracy_reward": 0.8884940147399902, | |
| "rewards/format_reward": 1.0, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.16341400146484, | |
| "epoch": 4.0344827586206895, | |
| "grad_norm": 1.4221350160981188, | |
| "kl": 0.036376953125, | |
| "learning_rate": 5.96551724137931e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8791723251342773, | |
| "reward_std": 0.022562285885214806, | |
| "rewards/accuracy_reward": 0.8791724443435669, | |
| "rewards/format_reward": 1.0, | |
| "step": 351 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.138671875, | |
| "epoch": 4.045977011494253, | |
| "grad_norm": 1.654901894003328, | |
| "kl": 0.038818359375, | |
| "learning_rate": 5.954022988505747e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8821697235107422, | |
| "reward_std": 0.02151145413517952, | |
| "rewards/accuracy_reward": 0.8821697235107422, | |
| "rewards/format_reward": 1.0, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.92839050292969, | |
| "epoch": 4.057471264367816, | |
| "grad_norm": 1.5963537501741232, | |
| "kl": 0.039306640625, | |
| "learning_rate": 5.942528735632184e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8790719509124756, | |
| "reward_std": 0.021063879132270813, | |
| "rewards/accuracy_reward": 0.8797230124473572, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 353 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.041015625, | |
| "epoch": 4.068965517241379, | |
| "grad_norm": 1.8114870192410322, | |
| "kl": 0.03759765625, | |
| "learning_rate": 5.93103448275862e-07, | |
| "loss": 0.0016, | |
| "reward": 1.865633487701416, | |
| "reward_std": 0.0278116874396801, | |
| "rewards/accuracy_reward": 0.8669356107711792, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.82357025146484, | |
| "epoch": 4.080459770114943, | |
| "grad_norm": 2.8490723955207993, | |
| "kl": 0.037841796875, | |
| "learning_rate": 5.919540229885057e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8946452140808105, | |
| "reward_std": 0.021742399781942368, | |
| "rewards/accuracy_reward": 0.8946452140808105, | |
| "rewards/format_reward": 1.0, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.27474212646484, | |
| "epoch": 4.091954022988506, | |
| "grad_norm": 1.5566047273728554, | |
| "kl": 0.037353515625, | |
| "learning_rate": 5.908045977011494e-07, | |
| "loss": 0.0016, | |
| "reward": 1.875108242034912, | |
| "reward_std": 0.021865393966436386, | |
| "rewards/accuracy_reward": 0.8751082420349121, | |
| "rewards/format_reward": 1.0, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.48763275146484, | |
| "epoch": 4.103448275862069, | |
| "grad_norm": 1.9776553067288296, | |
| "kl": 0.03564453125, | |
| "learning_rate": 5.89655172413793e-07, | |
| "loss": 0.0015, | |
| "reward": 1.881319284439087, | |
| "reward_std": 0.024803385138511658, | |
| "rewards/accuracy_reward": 0.8813192248344421, | |
| "rewards/format_reward": 1.0, | |
| "step": 357 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.21940612792969, | |
| "epoch": 4.114942528735632, | |
| "grad_norm": 1.6859022821903558, | |
| "kl": 0.0361328125, | |
| "learning_rate": 5.885057471264368e-07, | |
| "loss": 0.0015, | |
| "reward": 1.86405348777771, | |
| "reward_std": 0.023439563810825348, | |
| "rewards/accuracy_reward": 0.8653554916381836, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.50521087646484, | |
| "epoch": 4.126436781609195, | |
| "grad_norm": 2.4612604405299474, | |
| "kl": 0.03955078125, | |
| "learning_rate": 5.873563218390805e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8807547092437744, | |
| "reward_std": 0.02118275687098503, | |
| "rewards/accuracy_reward": 0.880754828453064, | |
| "rewards/format_reward": 1.0, | |
| "step": 359 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.39714050292969, | |
| "epoch": 4.137931034482759, | |
| "grad_norm": 1.6609596832825777, | |
| "kl": 0.035888671875, | |
| "learning_rate": 5.86206896551724e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8441890478134155, | |
| "reward_std": 0.023664182052016258, | |
| "rewards/accuracy_reward": 0.8441890478134155, | |
| "rewards/format_reward": 1.0, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.435546875, | |
| "epoch": 4.149425287356322, | |
| "grad_norm": 2.5111143452832625, | |
| "kl": 0.034912109375, | |
| "learning_rate": 5.850574712643678e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8899774551391602, | |
| "reward_std": 0.026515571400523186, | |
| "rewards/accuracy_reward": 0.8906285762786865, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 361 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.841796875, | |
| "epoch": 4.160919540229885, | |
| "grad_norm": 3.9427924775939784, | |
| "kl": 0.037353515625, | |
| "learning_rate": 5.839080459770115e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8844678401947021, | |
| "reward_std": 0.02146860770881176, | |
| "rewards/accuracy_reward": 0.8844677805900574, | |
| "rewards/format_reward": 1.0, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.84765625, | |
| "epoch": 4.172413793103448, | |
| "grad_norm": 1.6920489480855494, | |
| "kl": 0.037109375, | |
| "learning_rate": 5.827586206896552e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8880507946014404, | |
| "reward_std": 0.022687291726469994, | |
| "rewards/accuracy_reward": 0.8880506753921509, | |
| "rewards/format_reward": 1.0, | |
| "step": 363 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.06315612792969, | |
| "epoch": 4.183908045977011, | |
| "grad_norm": 2.152778241207781, | |
| "kl": 0.037353515625, | |
| "learning_rate": 5.816091954022988e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8860154151916504, | |
| "reward_std": 0.02288450300693512, | |
| "rewards/accuracy_reward": 0.8860155940055847, | |
| "rewards/format_reward": 1.0, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.43034362792969, | |
| "epoch": 4.195402298850575, | |
| "grad_norm": 1.3564134180585476, | |
| "kl": 0.037353515625, | |
| "learning_rate": 5.804597701149425e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8712117671966553, | |
| "reward_std": 0.02276797592639923, | |
| "rewards/accuracy_reward": 0.8712117671966553, | |
| "rewards/format_reward": 1.0, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.294921875, | |
| "epoch": 4.206896551724138, | |
| "grad_norm": 1.1730079534850056e+29, | |
| "kl": 2.937689741663549e+26, | |
| "learning_rate": 5.793103448275862e-07, | |
| "loss": 1.1743651528396316e+25, | |
| "reward": 1.8752689361572266, | |
| "reward_std": 0.025293130427598953, | |
| "rewards/accuracy_reward": 0.8765711784362793, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.33203125, | |
| "epoch": 4.218390804597701, | |
| "grad_norm": 1.492960779675916, | |
| "kl": 0.038818359375, | |
| "learning_rate": 5.781609195402298e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8979356288909912, | |
| "reward_std": 0.023821339011192322, | |
| "rewards/accuracy_reward": 0.8985867500305176, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 367 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.62630462646484, | |
| "epoch": 4.2298850574712645, | |
| "grad_norm": 1.4743009917575458, | |
| "kl": 0.040283203125, | |
| "learning_rate": 5.770114942528736e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9055975675582886, | |
| "reward_std": 0.019971728324890137, | |
| "rewards/accuracy_reward": 0.9055975675582886, | |
| "rewards/format_reward": 1.0, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.99674987792969, | |
| "epoch": 4.241379310344827, | |
| "grad_norm": 3.9302261665663982, | |
| "kl": 0.0400390625, | |
| "learning_rate": 5.758620689655173e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8823680877685547, | |
| "reward_std": 0.022455014288425446, | |
| "rewards/accuracy_reward": 0.8823680877685547, | |
| "rewards/format_reward": 1.0, | |
| "step": 369 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.39583587646484, | |
| "epoch": 4.252873563218391, | |
| "grad_norm": 2.0938326905522238, | |
| "kl": 0.0400390625, | |
| "learning_rate": 5.747126436781608e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8727083206176758, | |
| "reward_std": 0.02592053823173046, | |
| "rewards/accuracy_reward": 0.8733593821525574, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.81706237792969, | |
| "epoch": 4.264367816091954, | |
| "grad_norm": 2.888553640412068, | |
| "kl": 0.040771484375, | |
| "learning_rate": 5.735632183908046e-07, | |
| "loss": 0.0017, | |
| "reward": 1.882491111755371, | |
| "reward_std": 0.020577851682901382, | |
| "rewards/accuracy_reward": 0.8824911117553711, | |
| "rewards/format_reward": 1.0, | |
| "step": 371 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.640625, | |
| "epoch": 4.275862068965517, | |
| "grad_norm": 1.4686082294156166, | |
| "kl": 0.051513671875, | |
| "learning_rate": 5.724137931034483e-07, | |
| "loss": 0.0021, | |
| "reward": 1.882289171218872, | |
| "reward_std": 0.023612579330801964, | |
| "rewards/accuracy_reward": 0.8822891712188721, | |
| "rewards/format_reward": 1.0, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.6875, | |
| "epoch": 4.287356321839081, | |
| "grad_norm": 2.244464531990695, | |
| "kl": 0.040283203125, | |
| "learning_rate": 5.712643678160918e-07, | |
| "loss": 0.0017, | |
| "reward": 1.886061429977417, | |
| "reward_std": 0.023760950192809105, | |
| "rewards/accuracy_reward": 0.8867123126983643, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 373 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.7734375, | |
| "epoch": 4.2988505747126435, | |
| "grad_norm": 1.5023210240131932, | |
| "kl": 0.040771484375, | |
| "learning_rate": 5.701149425287356e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8881802558898926, | |
| "reward_std": 0.019625011831521988, | |
| "rewards/accuracy_reward": 0.8881802558898926, | |
| "rewards/format_reward": 1.0, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.12044525146484, | |
| "epoch": 4.310344827586207, | |
| "grad_norm": 1.757713278971671, | |
| "kl": 0.038330078125, | |
| "learning_rate": 5.689655172413793e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8772163391113281, | |
| "reward_std": 0.022540029138326645, | |
| "rewards/accuracy_reward": 0.8772165179252625, | |
| "rewards/format_reward": 1.0, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.61458587646484, | |
| "epoch": 4.32183908045977, | |
| "grad_norm": 1.6452684635685983, | |
| "kl": 0.038818359375, | |
| "learning_rate": 5.678160919540229e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8954042196273804, | |
| "reward_std": 0.02163463830947876, | |
| "rewards/accuracy_reward": 0.8967063426971436, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 376 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.05208587646484, | |
| "epoch": 4.333333333333333, | |
| "grad_norm": 2.2555305608969154, | |
| "kl": 0.03955078125, | |
| "learning_rate": 5.666666666666666e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8833587169647217, | |
| "reward_std": 0.021704208105802536, | |
| "rewards/accuracy_reward": 0.8833586573600769, | |
| "rewards/format_reward": 1.0, | |
| "step": 377 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.78450775146484, | |
| "epoch": 4.344827586206897, | |
| "grad_norm": 2.212131167395634, | |
| "kl": 0.04541015625, | |
| "learning_rate": 5.655172413793103e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8870363235473633, | |
| "reward_std": 0.020737424492836, | |
| "rewards/accuracy_reward": 0.8870364427566528, | |
| "rewards/format_reward": 1.0, | |
| "step": 378 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.40104675292969, | |
| "epoch": 4.35632183908046, | |
| "grad_norm": 1.529590167352119, | |
| "kl": 0.045654296875, | |
| "learning_rate": 5.643678160919541e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8749973773956299, | |
| "reward_std": 0.018837254494428635, | |
| "rewards/accuracy_reward": 0.8749972581863403, | |
| "rewards/format_reward": 1.0, | |
| "step": 379 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.90495300292969, | |
| "epoch": 4.3678160919540225, | |
| "grad_norm": 2.1938200030551385, | |
| "kl": 0.04248046875, | |
| "learning_rate": 5.632183908045976e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8977670669555664, | |
| "reward_std": 0.02074752375483513, | |
| "rewards/accuracy_reward": 0.8977672457695007, | |
| "rewards/format_reward": 1.0, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.07096862792969, | |
| "epoch": 4.379310344827586, | |
| "grad_norm": 3.9533429205437813, | |
| "kl": 0.047119140625, | |
| "learning_rate": 5.620689655172414e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9117012023925781, | |
| "reward_std": 0.02050834149122238, | |
| "rewards/accuracy_reward": 0.9117013216018677, | |
| "rewards/format_reward": 1.0, | |
| "step": 381 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.44140625, | |
| "epoch": 4.390804597701149, | |
| "grad_norm": 2.4576931014163828, | |
| "kl": 0.04150390625, | |
| "learning_rate": 5.609195402298851e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8892667293548584, | |
| "reward_std": 0.024918202310800552, | |
| "rewards/accuracy_reward": 0.8899178504943848, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 382 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.494140625, | |
| "epoch": 4.402298850574713, | |
| "grad_norm": 2.1902999394095324, | |
| "kl": 0.04150390625, | |
| "learning_rate": 5.597701149425286e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9016425609588623, | |
| "reward_std": 0.018868938088417053, | |
| "rewards/accuracy_reward": 0.9016425013542175, | |
| "rewards/format_reward": 1.0, | |
| "step": 383 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.78515625, | |
| "epoch": 4.413793103448276, | |
| "grad_norm": 1.883452563889067, | |
| "kl": 0.041259765625, | |
| "learning_rate": 5.586206896551724e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8980283737182617, | |
| "reward_std": 0.021328266710042953, | |
| "rewards/accuracy_reward": 0.8986794352531433, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 384 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.46745300292969, | |
| "epoch": 4.425287356321839, | |
| "grad_norm": 2.3714294651121084, | |
| "kl": 0.042724609375, | |
| "learning_rate": 5.574712643678161e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8801054954528809, | |
| "reward_std": 0.02180486172437668, | |
| "rewards/accuracy_reward": 0.8801054954528809, | |
| "rewards/format_reward": 1.0, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.17057800292969, | |
| "epoch": 4.436781609195402, | |
| "grad_norm": 1.3480188133058681, | |
| "kl": 0.044189453125, | |
| "learning_rate": 5.563218390804598e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9038050174713135, | |
| "reward_std": 0.023260444402694702, | |
| "rewards/accuracy_reward": 0.9044560194015503, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 386 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.60807800292969, | |
| "epoch": 4.448275862068965, | |
| "grad_norm": 1.558932018548512, | |
| "kl": 0.040283203125, | |
| "learning_rate": 5.551724137931034e-07, | |
| "loss": 0.0016, | |
| "reward": 1.883833646774292, | |
| "reward_std": 0.021813951432704926, | |
| "rewards/accuracy_reward": 0.883833646774292, | |
| "rewards/format_reward": 1.0, | |
| "step": 387 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.23372650146484, | |
| "epoch": 4.459770114942529, | |
| "grad_norm": 3.4341748113797994, | |
| "kl": 0.042724609375, | |
| "learning_rate": 5.540229885057471e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8973731994628906, | |
| "reward_std": 0.021094098687171936, | |
| "rewards/accuracy_reward": 0.8973731398582458, | |
| "rewards/format_reward": 1.0, | |
| "step": 388 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.3828125, | |
| "epoch": 4.471264367816092, | |
| "grad_norm": 2.3537727105401713, | |
| "kl": 0.045654296875, | |
| "learning_rate": 5.528735632183908e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9044450521469116, | |
| "reward_std": 0.02275899611413479, | |
| "rewards/accuracy_reward": 0.9050959944725037, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 389 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.58528900146484, | |
| "epoch": 4.482758620689655, | |
| "grad_norm": 2.091387625283226, | |
| "kl": 0.044921875, | |
| "learning_rate": 5.517241379310344e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8750637769699097, | |
| "reward_std": 0.02193189226090908, | |
| "rewards/accuracy_reward": 0.8750636577606201, | |
| "rewards/format_reward": 1.0, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.68489837646484, | |
| "epoch": 4.494252873563219, | |
| "grad_norm": 1.563517095069176, | |
| "kl": 0.0439453125, | |
| "learning_rate": 5.505747126436782e-07, | |
| "loss": 0.0018, | |
| "reward": 1.895681381225586, | |
| "reward_std": 0.021227214485406876, | |
| "rewards/accuracy_reward": 0.8956813812255859, | |
| "rewards/format_reward": 1.0, | |
| "step": 391 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.5703125, | |
| "epoch": 4.505747126436781, | |
| "grad_norm": 1.4561203507859735, | |
| "kl": 0.044677734375, | |
| "learning_rate": 5.494252873563218e-07, | |
| "loss": 0.0018, | |
| "reward": 1.903104543685913, | |
| "reward_std": 0.019182192161679268, | |
| "rewards/accuracy_reward": 0.9031044840812683, | |
| "rewards/format_reward": 1.0, | |
| "step": 392 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.791015625, | |
| "epoch": 4.517241379310345, | |
| "grad_norm": 2.20068200415075, | |
| "kl": 0.03955078125, | |
| "learning_rate": 5.482758620689654e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8813633918762207, | |
| "reward_std": 0.02065703272819519, | |
| "rewards/accuracy_reward": 0.8813633918762207, | |
| "rewards/format_reward": 1.0, | |
| "step": 393 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.048828125, | |
| "epoch": 4.528735632183908, | |
| "grad_norm": 2.153835030110258, | |
| "kl": 0.0400390625, | |
| "learning_rate": 5.471264367816092e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8950791358947754, | |
| "reward_std": 0.018896300345659256, | |
| "rewards/accuracy_reward": 0.8950790762901306, | |
| "rewards/format_reward": 1.0, | |
| "step": 394 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.44206237792969, | |
| "epoch": 4.540229885057471, | |
| "grad_norm": 1.8689205868418028, | |
| "kl": 0.04931640625, | |
| "learning_rate": 5.459770114942529e-07, | |
| "loss": 0.002, | |
| "reward": 1.8724088668823242, | |
| "reward_std": 0.022876422852277756, | |
| "rewards/accuracy_reward": 0.8730599880218506, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.875, | |
| "epoch": 4.551724137931035, | |
| "grad_norm": 1.6153380302756586, | |
| "kl": 0.040283203125, | |
| "learning_rate": 5.448275862068966e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8891279697418213, | |
| "reward_std": 0.022592080757021904, | |
| "rewards/accuracy_reward": 0.8897790312767029, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 396 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.71614837646484, | |
| "epoch": 4.563218390804598, | |
| "grad_norm": 1.5101980917336142, | |
| "kl": 0.039306640625, | |
| "learning_rate": 5.436781609195402e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8952821493148804, | |
| "reward_std": 0.02106665074825287, | |
| "rewards/accuracy_reward": 0.8952820301055908, | |
| "rewards/format_reward": 1.0, | |
| "step": 397 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.232421875, | |
| "epoch": 4.574712643678161, | |
| "grad_norm": 1.4461465189423137, | |
| "kl": 0.040283203125, | |
| "learning_rate": 5.425287356321839e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8775584697723389, | |
| "reward_std": 0.025320343673229218, | |
| "rewards/accuracy_reward": 0.8782094717025757, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 398 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.45052337646484, | |
| "epoch": 4.586206896551724, | |
| "grad_norm": 1.2782259292486342, | |
| "kl": 0.038818359375, | |
| "learning_rate": 5.413793103448276e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8783025741577148, | |
| "reward_std": 0.024571552872657776, | |
| "rewards/accuracy_reward": 0.8789536356925964, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 399 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.37435150146484, | |
| "epoch": 4.597701149425287, | |
| "grad_norm": 4.341249687805903, | |
| "kl": 0.03955078125, | |
| "learning_rate": 5.402298850574712e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8882777690887451, | |
| "reward_std": 0.025847896933555603, | |
| "rewards/accuracy_reward": 0.8895797729492188, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.140625, | |
| "epoch": 4.609195402298851, | |
| "grad_norm": 1.3663166069622814, | |
| "kl": 0.03759765625, | |
| "learning_rate": 5.39080459770115e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8888124227523804, | |
| "reward_std": 0.025306126102805138, | |
| "rewards/accuracy_reward": 0.8901144862174988, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 401 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.66471862792969, | |
| "epoch": 4.620689655172414, | |
| "grad_norm": 1.6302378119468088, | |
| "kl": 0.03955078125, | |
| "learning_rate": 5.379310344827586e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8746542930603027, | |
| "reward_std": 0.02441917359828949, | |
| "rewards/accuracy_reward": 0.8753053545951843, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 402 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.66276550292969, | |
| "epoch": 4.6321839080459775, | |
| "grad_norm": 1.4816501158522215, | |
| "kl": 0.039794921875, | |
| "learning_rate": 5.367816091954022e-07, | |
| "loss": 0.0016, | |
| "reward": 1.894063949584961, | |
| "reward_std": 0.024088043719530106, | |
| "rewards/accuracy_reward": 0.8947149515151978, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 403 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.3828125, | |
| "epoch": 4.64367816091954, | |
| "grad_norm": 1.6836651779382024, | |
| "kl": 0.04345703125, | |
| "learning_rate": 5.35632183908046e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8845057487487793, | |
| "reward_std": 0.02305610477924347, | |
| "rewards/accuracy_reward": 0.8845058679580688, | |
| "rewards/format_reward": 1.0, | |
| "step": 404 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.69596862792969, | |
| "epoch": 4.655172413793103, | |
| "grad_norm": 1.1778273851354586, | |
| "kl": 0.04345703125, | |
| "learning_rate": 5.344827586206896e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8848464488983154, | |
| "reward_std": 0.02404012344777584, | |
| "rewards/accuracy_reward": 0.8854975700378418, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 405 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.251953125, | |
| "epoch": 4.666666666666667, | |
| "grad_norm": 1.427234022329123, | |
| "kl": 0.043212890625, | |
| "learning_rate": 5.333333333333333e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9008078575134277, | |
| "reward_std": 0.020701348781585693, | |
| "rewards/accuracy_reward": 0.9008078575134277, | |
| "rewards/format_reward": 1.0, | |
| "step": 406 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.42513275146484, | |
| "epoch": 4.67816091954023, | |
| "grad_norm": 2.5640372485607696, | |
| "kl": 0.04296875, | |
| "learning_rate": 5.32183908045977e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8793271780014038, | |
| "reward_std": 0.026321526616811752, | |
| "rewards/accuracy_reward": 0.8806290626525879, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 407 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.38932800292969, | |
| "epoch": 4.689655172413794, | |
| "grad_norm": 2.3519843115525343, | |
| "kl": 0.041259765625, | |
| "learning_rate": 5.310344827586206e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9006179571151733, | |
| "reward_std": 0.0217905193567276, | |
| "rewards/accuracy_reward": 0.9006179571151733, | |
| "rewards/format_reward": 1.0, | |
| "step": 408 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.32161712646484, | |
| "epoch": 4.7011494252873565, | |
| "grad_norm": 3.886735063581474, | |
| "kl": 0.0458984375, | |
| "learning_rate": 5.298850574712644e-07, | |
| "loss": 0.0019, | |
| "reward": 1.904909372329712, | |
| "reward_std": 0.019571471959352493, | |
| "rewards/accuracy_reward": 0.9049093127250671, | |
| "rewards/format_reward": 1.0, | |
| "step": 409 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.49935150146484, | |
| "epoch": 4.712643678160919, | |
| "grad_norm": 1.9215943999639125, | |
| "kl": 0.045654296875, | |
| "learning_rate": 5.28735632183908e-07, | |
| "loss": 0.0019, | |
| "reward": 1.896188735961914, | |
| "reward_std": 0.02258872799575329, | |
| "rewards/accuracy_reward": 0.8961889147758484, | |
| "rewards/format_reward": 1.0, | |
| "step": 410 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.84635925292969, | |
| "epoch": 4.724137931034483, | |
| "grad_norm": 2.323925371294666, | |
| "kl": 0.039794921875, | |
| "learning_rate": 5.275862068965517e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8925457000732422, | |
| "reward_std": 0.01859492063522339, | |
| "rewards/accuracy_reward": 0.8925456404685974, | |
| "rewards/format_reward": 1.0, | |
| "step": 411 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.20638275146484, | |
| "epoch": 4.735632183908046, | |
| "grad_norm": 1.655400396429134, | |
| "kl": 0.041748046875, | |
| "learning_rate": 5.264367816091954e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8739862442016602, | |
| "reward_std": 0.021640470251441002, | |
| "rewards/accuracy_reward": 0.8746374845504761, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 412 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.99153900146484, | |
| "epoch": 4.747126436781609, | |
| "grad_norm": 1.9363409073368856, | |
| "kl": 0.049072265625, | |
| "learning_rate": 5.252873563218391e-07, | |
| "loss": 0.002, | |
| "reward": 1.8912228345870972, | |
| "reward_std": 0.021241484209895134, | |
| "rewards/accuracy_reward": 0.8918737769126892, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 413 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.73763275146484, | |
| "epoch": 4.758620689655173, | |
| "grad_norm": 2.279760774245835, | |
| "kl": 0.04052734375, | |
| "learning_rate": 5.241379310344828e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9129436016082764, | |
| "reward_std": 0.01870916411280632, | |
| "rewards/accuracy_reward": 0.9129435420036316, | |
| "rewards/format_reward": 1.0, | |
| "step": 414 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.61003112792969, | |
| "epoch": 4.7701149425287355, | |
| "grad_norm": 2.45904909614863, | |
| "kl": 0.03515625, | |
| "learning_rate": 5.229885057471264e-07, | |
| "loss": 0.0015, | |
| "reward": 1.9052283763885498, | |
| "reward_std": 0.019063513725996017, | |
| "rewards/accuracy_reward": 0.9052283763885498, | |
| "rewards/format_reward": 1.0, | |
| "step": 415 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.0078125, | |
| "epoch": 4.781609195402299, | |
| "grad_norm": 1.6845003543176094, | |
| "kl": 0.041015625, | |
| "learning_rate": 5.218390804597701e-07, | |
| "loss": 0.0017, | |
| "reward": 1.894358515739441, | |
| "reward_std": 0.022152118384838104, | |
| "rewards/accuracy_reward": 0.8950096964836121, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 416 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.95833587646484, | |
| "epoch": 4.793103448275862, | |
| "grad_norm": 4.433200172934703, | |
| "kl": 0.0634765625, | |
| "learning_rate": 5.206896551724138e-07, | |
| "loss": 0.0026, | |
| "reward": 1.8981165885925293, | |
| "reward_std": 0.020231489092111588, | |
| "rewards/accuracy_reward": 0.8981165885925293, | |
| "rewards/format_reward": 1.0, | |
| "step": 417 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.35482025146484, | |
| "epoch": 4.804597701149425, | |
| "grad_norm": 1.7763571926472603, | |
| "kl": 0.042724609375, | |
| "learning_rate": 5.195402298850574e-07, | |
| "loss": 0.0018, | |
| "reward": 1.906449556350708, | |
| "reward_std": 0.020989403128623962, | |
| "rewards/accuracy_reward": 0.9064494967460632, | |
| "rewards/format_reward": 1.0, | |
| "step": 418 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.24089050292969, | |
| "epoch": 4.816091954022989, | |
| "grad_norm": 1.4886525030955973, | |
| "kl": 0.04736328125, | |
| "learning_rate": 5.183908045977012e-07, | |
| "loss": 0.002, | |
| "reward": 1.8850057125091553, | |
| "reward_std": 0.020064577460289, | |
| "rewards/accuracy_reward": 0.8850056529045105, | |
| "rewards/format_reward": 1.0, | |
| "step": 419 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.37565612792969, | |
| "epoch": 4.827586206896552, | |
| "grad_norm": 2.8859840023098866, | |
| "kl": 0.0390625, | |
| "learning_rate": 5.172413793103448e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8768961429595947, | |
| "reward_std": 0.018807468935847282, | |
| "rewards/accuracy_reward": 0.8768962621688843, | |
| "rewards/format_reward": 1.0, | |
| "step": 420 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.56380462646484, | |
| "epoch": 4.8390804597701145, | |
| "grad_norm": 1.8199908836931964, | |
| "kl": 0.04150390625, | |
| "learning_rate": 5.160919540229884e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8867862224578857, | |
| "reward_std": 0.022299369797110558, | |
| "rewards/accuracy_reward": 0.8867861032485962, | |
| "rewards/format_reward": 1.0, | |
| "step": 421 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.96484375, | |
| "epoch": 4.850574712643678, | |
| "grad_norm": 1.415400095170082, | |
| "kl": 0.042724609375, | |
| "learning_rate": 5.149425287356322e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9118112325668335, | |
| "reward_std": 0.019953366369009018, | |
| "rewards/accuracy_reward": 0.9118112325668335, | |
| "rewards/format_reward": 1.0, | |
| "step": 422 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.650390625, | |
| "epoch": 4.862068965517241, | |
| "grad_norm": 2.027158566984518, | |
| "kl": 0.044189453125, | |
| "learning_rate": 5.137931034482759e-07, | |
| "loss": 0.0018, | |
| "reward": 1.878554344177246, | |
| "reward_std": 0.019561052322387695, | |
| "rewards/accuracy_reward": 0.8785543441772461, | |
| "rewards/format_reward": 1.0, | |
| "step": 423 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.13932800292969, | |
| "epoch": 4.873563218390805, | |
| "grad_norm": 3.123515896042613, | |
| "kl": 0.048095703125, | |
| "learning_rate": 5.126436781609194e-07, | |
| "loss": 0.002, | |
| "reward": 1.8973406553268433, | |
| "reward_std": 0.021455293521285057, | |
| "rewards/accuracy_reward": 0.8973406553268433, | |
| "rewards/format_reward": 1.0, | |
| "step": 424 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.84831237792969, | |
| "epoch": 4.885057471264368, | |
| "grad_norm": 2.2665060638151737, | |
| "kl": 0.05078125, | |
| "learning_rate": 5.114942528735632e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8799655437469482, | |
| "reward_std": 0.02300919219851494, | |
| "rewards/accuracy_reward": 0.8799654841423035, | |
| "rewards/format_reward": 1.0, | |
| "step": 425 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.94401550292969, | |
| "epoch": 4.896551724137931, | |
| "grad_norm": 1.6919605234703265, | |
| "kl": 0.0556640625, | |
| "learning_rate": 5.103448275862069e-07, | |
| "loss": 0.0023, | |
| "reward": 1.9173061847686768, | |
| "reward_std": 0.01950909197330475, | |
| "rewards/accuracy_reward": 0.9173061847686768, | |
| "rewards/format_reward": 1.0, | |
| "step": 426 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.74153900146484, | |
| "epoch": 4.908045977011494, | |
| "grad_norm": 2.1229070642282553, | |
| "kl": 0.04931640625, | |
| "learning_rate": 5.091954022988506e-07, | |
| "loss": 0.002, | |
| "reward": 1.8742494583129883, | |
| "reward_std": 0.023737162351608276, | |
| "rewards/accuracy_reward": 0.8742495775222778, | |
| "rewards/format_reward": 1.0, | |
| "step": 427 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.42643737792969, | |
| "epoch": 4.919540229885057, | |
| "grad_norm": 2.1832269520490892, | |
| "kl": 0.052001953125, | |
| "learning_rate": 5.080459770114942e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8955520391464233, | |
| "reward_std": 0.020405521616339684, | |
| "rewards/accuracy_reward": 0.8955520391464233, | |
| "rewards/format_reward": 1.0, | |
| "step": 428 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.27214050292969, | |
| "epoch": 4.931034482758621, | |
| "grad_norm": 2.536754289442291, | |
| "kl": 0.0546875, | |
| "learning_rate": 5.068965517241379e-07, | |
| "loss": 0.0023, | |
| "reward": 1.896885633468628, | |
| "reward_std": 0.020467374473810196, | |
| "rewards/accuracy_reward": 0.8968856334686279, | |
| "rewards/format_reward": 1.0, | |
| "step": 429 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.57357025146484, | |
| "epoch": 4.942528735632184, | |
| "grad_norm": 1.3718070549122303, | |
| "kl": 0.05517578125, | |
| "learning_rate": 5.057471264367817e-07, | |
| "loss": 0.0022, | |
| "reward": 1.8929929733276367, | |
| "reward_std": 0.024712808430194855, | |
| "rewards/accuracy_reward": 0.8929929733276367, | |
| "rewards/format_reward": 1.0, | |
| "step": 430 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.10807800292969, | |
| "epoch": 4.954022988505747, | |
| "grad_norm": 2.668322989932513, | |
| "kl": 0.05322265625, | |
| "learning_rate": 5.045977011494252e-07, | |
| "loss": 0.0022, | |
| "reward": 1.8981032371520996, | |
| "reward_std": 0.020073171705007553, | |
| "rewards/accuracy_reward": 0.8981032371520996, | |
| "rewards/format_reward": 1.0, | |
| "step": 431 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.62630462646484, | |
| "epoch": 4.9655172413793105, | |
| "grad_norm": 2.4540498394359047, | |
| "kl": 0.0546875, | |
| "learning_rate": 5.03448275862069e-07, | |
| "loss": 0.0023, | |
| "reward": 1.8993358612060547, | |
| "reward_std": 0.018647989258170128, | |
| "rewards/accuracy_reward": 0.8993359804153442, | |
| "rewards/format_reward": 1.0, | |
| "step": 432 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.70964050292969, | |
| "epoch": 4.977011494252873, | |
| "grad_norm": 1.963612640849598, | |
| "kl": 0.052734375, | |
| "learning_rate": 5.022988505747127e-07, | |
| "loss": 0.0022, | |
| "reward": 1.8898952007293701, | |
| "reward_std": 0.022081639617681503, | |
| "rewards/accuracy_reward": 0.8898952007293701, | |
| "rewards/format_reward": 1.0, | |
| "step": 433 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.87109375, | |
| "epoch": 4.988505747126437, | |
| "grad_norm": 2.3000221102493157, | |
| "kl": 0.046142578125, | |
| "learning_rate": 5.011494252873562e-07, | |
| "loss": 0.0019, | |
| "reward": 1.89603853225708, | |
| "reward_std": 0.020961783826351166, | |
| "rewards/accuracy_reward": 0.8960385322570801, | |
| "rewards/format_reward": 1.0, | |
| "step": 434 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 99.20084381103516, | |
| "epoch": 5.0, | |
| "grad_norm": 1.792728503326833, | |
| "kl": 0.045166015625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9164615869522095, | |
| "reward_std": 0.01855144090950489, | |
| "rewards/accuracy_reward": 0.9164614677429199, | |
| "rewards/format_reward": 1.0, | |
| "step": 435 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.03125, | |
| "epoch": 5.011494252873563, | |
| "grad_norm": 2.411488733190351, | |
| "kl": 0.042236328125, | |
| "learning_rate": 4.988505747126436e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8886754512786865, | |
| "reward_std": 0.023060960695147514, | |
| "rewards/accuracy_reward": 0.8886754512786865, | |
| "rewards/format_reward": 1.0, | |
| "step": 436 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.51302337646484, | |
| "epoch": 5.022988505747127, | |
| "grad_norm": 1.2846385937268994, | |
| "kl": 0.04638671875, | |
| "learning_rate": 4.977011494252874e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8596938848495483, | |
| "reward_std": 0.02523483708500862, | |
| "rewards/accuracy_reward": 0.8603450059890747, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 437 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.48177337646484, | |
| "epoch": 5.0344827586206895, | |
| "grad_norm": 4.302610321521328, | |
| "kl": 0.04638671875, | |
| "learning_rate": 4.96551724137931e-07, | |
| "loss": 0.0019, | |
| "reward": 1.875889539718628, | |
| "reward_std": 0.02352536842226982, | |
| "rewards/accuracy_reward": 0.8758895397186279, | |
| "rewards/format_reward": 1.0, | |
| "step": 438 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.92578125, | |
| "epoch": 5.045977011494253, | |
| "grad_norm": 1.9265386376130789, | |
| "kl": 0.0390625, | |
| "learning_rate": 4.954022988505746e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8992599248886108, | |
| "reward_std": 0.023157542571425438, | |
| "rewards/accuracy_reward": 0.8999109268188477, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 439 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.67448425292969, | |
| "epoch": 5.057471264367816, | |
| "grad_norm": 1.6723222924852237, | |
| "kl": 0.0439453125, | |
| "learning_rate": 4.942528735632184e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8739842176437378, | |
| "reward_std": 0.022657310590147972, | |
| "rewards/accuracy_reward": 0.8746352195739746, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 440 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.04167175292969, | |
| "epoch": 5.068965517241379, | |
| "grad_norm": 1.602166960354794, | |
| "kl": 0.039306640625, | |
| "learning_rate": 4.93103448275862e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8998823165893555, | |
| "reward_std": 0.02184360846877098, | |
| "rewards/accuracy_reward": 0.8998822569847107, | |
| "rewards/format_reward": 1.0, | |
| "step": 441 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.63932800292969, | |
| "epoch": 5.080459770114943, | |
| "grad_norm": 2.4680045505704205, | |
| "kl": 0.039306640625, | |
| "learning_rate": 4.919540229885058e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8900998830795288, | |
| "reward_std": 0.02164739929139614, | |
| "rewards/accuracy_reward": 0.8900998830795288, | |
| "rewards/format_reward": 1.0, | |
| "step": 442 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.908203125, | |
| "epoch": 5.091954022988506, | |
| "grad_norm": 1.390816247246552, | |
| "kl": 0.041748046875, | |
| "learning_rate": 4.908045977011494e-07, | |
| "loss": 0.0017, | |
| "reward": 1.889683723449707, | |
| "reward_std": 0.022919952869415283, | |
| "rewards/accuracy_reward": 0.8896838426589966, | |
| "rewards/format_reward": 1.0, | |
| "step": 443 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.91341400146484, | |
| "epoch": 5.103448275862069, | |
| "grad_norm": 1.197765074052055, | |
| "kl": 0.04736328125, | |
| "learning_rate": 4.89655172413793e-07, | |
| "loss": 0.002, | |
| "reward": 1.8937995433807373, | |
| "reward_std": 0.021353360265493393, | |
| "rewards/accuracy_reward": 0.8937994837760925, | |
| "rewards/format_reward": 1.0, | |
| "step": 444 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.26692962646484, | |
| "epoch": 5.114942528735632, | |
| "grad_norm": 24.16259559584872, | |
| "kl": 0.0478515625, | |
| "learning_rate": 4.885057471264368e-07, | |
| "loss": 0.002, | |
| "reward": 1.8777693510055542, | |
| "reward_std": 0.019756315276026726, | |
| "rewards/accuracy_reward": 0.8777693510055542, | |
| "rewards/format_reward": 1.0, | |
| "step": 445 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.94206237792969, | |
| "epoch": 5.126436781609195, | |
| "grad_norm": 2.3533060150658844, | |
| "kl": 0.050537109375, | |
| "learning_rate": 4.873563218390804e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8900680541992188, | |
| "reward_std": 0.021740447729825974, | |
| "rewards/accuracy_reward": 0.8900680541992188, | |
| "rewards/format_reward": 1.0, | |
| "step": 446 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.99544525146484, | |
| "epoch": 5.137931034482759, | |
| "grad_norm": 2.1820025718352136, | |
| "kl": 0.046142578125, | |
| "learning_rate": 4.86206896551724e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8730883598327637, | |
| "reward_std": 0.022952785715460777, | |
| "rewards/accuracy_reward": 0.8730884790420532, | |
| "rewards/format_reward": 1.0, | |
| "step": 447 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.60546875, | |
| "epoch": 5.149425287356322, | |
| "grad_norm": 1.6285679666257735, | |
| "kl": 0.0400390625, | |
| "learning_rate": 4.850574712643678e-07, | |
| "loss": 0.0017, | |
| "reward": 1.889723777770996, | |
| "reward_std": 0.02422255277633667, | |
| "rewards/accuracy_reward": 0.8897239565849304, | |
| "rewards/format_reward": 1.0, | |
| "step": 448 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.62760925292969, | |
| "epoch": 5.160919540229885, | |
| "grad_norm": 1.838531888615258, | |
| "kl": 0.04150390625, | |
| "learning_rate": 4.839080459770114e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8831251859664917, | |
| "reward_std": 0.023560214787721634, | |
| "rewards/accuracy_reward": 0.8831251859664917, | |
| "rewards/format_reward": 1.0, | |
| "step": 449 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.32747650146484, | |
| "epoch": 5.172413793103448, | |
| "grad_norm": 3.3852429919466283, | |
| "kl": 0.04296875, | |
| "learning_rate": 4.827586206896552e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8916248083114624, | |
| "reward_std": 0.018707800656557083, | |
| "rewards/accuracy_reward": 0.891624927520752, | |
| "rewards/format_reward": 1.0, | |
| "step": 450 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.82747650146484, | |
| "epoch": 5.183908045977011, | |
| "grad_norm": 1.0585987907156273, | |
| "kl": 0.040771484375, | |
| "learning_rate": 4.816091954022988e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8994290828704834, | |
| "reward_std": 0.020674733445048332, | |
| "rewards/accuracy_reward": 0.9000802040100098, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 451 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.90950775146484, | |
| "epoch": 5.195402298850575, | |
| "grad_norm": 1.5208325687196538, | |
| "kl": 0.041259765625, | |
| "learning_rate": 4.804597701149424e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8975791931152344, | |
| "reward_std": 0.021052822470664978, | |
| "rewards/accuracy_reward": 0.898230254650116, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 452 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.69140625, | |
| "epoch": 5.206896551724138, | |
| "grad_norm": 1.7465767723786063, | |
| "kl": 0.04541015625, | |
| "learning_rate": 4.793103448275862e-07, | |
| "loss": 0.0019, | |
| "reward": 1.901599645614624, | |
| "reward_std": 0.019404159858822823, | |
| "rewards/accuracy_reward": 0.9015995860099792, | |
| "rewards/format_reward": 1.0, | |
| "step": 453 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.00716400146484, | |
| "epoch": 5.218390804597701, | |
| "grad_norm": 1.7494745095643287, | |
| "kl": 0.04296875, | |
| "learning_rate": 4.781609195402298e-07, | |
| "loss": 0.0018, | |
| "reward": 1.897741675376892, | |
| "reward_std": 0.018847916275262833, | |
| "rewards/accuracy_reward": 0.8977417945861816, | |
| "rewards/format_reward": 1.0, | |
| "step": 454 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.82552337646484, | |
| "epoch": 5.2298850574712645, | |
| "grad_norm": 149.72760374173768, | |
| "kl": 0.1123046875, | |
| "learning_rate": 4.770114942528736e-07, | |
| "loss": 0.0045, | |
| "reward": 1.9023511409759521, | |
| "reward_std": 0.02475971356034279, | |
| "rewards/accuracy_reward": 0.9030022621154785, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 455 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.265625, | |
| "epoch": 5.241379310344827, | |
| "grad_norm": 1.5032856559021295, | |
| "kl": 0.04248046875, | |
| "learning_rate": 4.7586206896551725e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8960156440734863, | |
| "reward_std": 0.020089447498321533, | |
| "rewards/accuracy_reward": 0.8960156440734863, | |
| "rewards/format_reward": 1.0, | |
| "step": 456 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.44466400146484, | |
| "epoch": 5.252873563218391, | |
| "grad_norm": 5.380770477839403, | |
| "kl": 0.043701171875, | |
| "learning_rate": 4.747126436781609e-07, | |
| "loss": 0.0018, | |
| "reward": 1.879178762435913, | |
| "reward_std": 0.024046046659350395, | |
| "rewards/accuracy_reward": 0.8798297643661499, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 457 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.21419525146484, | |
| "epoch": 5.264367816091954, | |
| "grad_norm": 1.307828461511747, | |
| "kl": 0.039794921875, | |
| "learning_rate": 4.735632183908046e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9000663757324219, | |
| "reward_std": 0.0211566723883152, | |
| "rewards/accuracy_reward": 0.9000665545463562, | |
| "rewards/format_reward": 1.0, | |
| "step": 458 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.48828125, | |
| "epoch": 5.275862068965517, | |
| "grad_norm": 1.6755788743402862, | |
| "kl": 0.046142578125, | |
| "learning_rate": 4.7241379310344827e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9154987335205078, | |
| "reward_std": 0.019508585333824158, | |
| "rewards/accuracy_reward": 0.9154989123344421, | |
| "rewards/format_reward": 1.0, | |
| "step": 459 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.59505462646484, | |
| "epoch": 5.287356321839081, | |
| "grad_norm": 1.410115167386873, | |
| "kl": 0.048828125, | |
| "learning_rate": 4.712643678160919e-07, | |
| "loss": 0.002, | |
| "reward": 1.8869271278381348, | |
| "reward_std": 0.021610528230667114, | |
| "rewards/accuracy_reward": 0.8875781893730164, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 460 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.11653900146484, | |
| "epoch": 5.2988505747126435, | |
| "grad_norm": 2.472073181988844, | |
| "kl": 0.046142578125, | |
| "learning_rate": 4.7011494252873565e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8838642835617065, | |
| "reward_std": 0.020128531381487846, | |
| "rewards/accuracy_reward": 0.8838642835617065, | |
| "rewards/format_reward": 1.0, | |
| "step": 461 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.26237487792969, | |
| "epoch": 5.310344827586207, | |
| "grad_norm": 1.982912784034594, | |
| "kl": 0.04638671875, | |
| "learning_rate": 4.689655172413793e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8954026699066162, | |
| "reward_std": 0.020416123792529106, | |
| "rewards/accuracy_reward": 0.8960537910461426, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 462 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.216796875, | |
| "epoch": 5.32183908045977, | |
| "grad_norm": 1.9630010216472946, | |
| "kl": 0.048828125, | |
| "learning_rate": 4.678160919540229e-07, | |
| "loss": 0.002, | |
| "reward": 1.8689860105514526, | |
| "reward_std": 0.02052994631230831, | |
| "rewards/accuracy_reward": 0.8689860105514526, | |
| "rewards/format_reward": 1.0, | |
| "step": 463 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.13997650146484, | |
| "epoch": 5.333333333333333, | |
| "grad_norm": 2.293151993990854, | |
| "kl": 0.05078125, | |
| "learning_rate": 4.6666666666666666e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8916696310043335, | |
| "reward_std": 0.016695309430360794, | |
| "rewards/accuracy_reward": 0.8916696906089783, | |
| "rewards/format_reward": 1.0, | |
| "step": 464 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.533203125, | |
| "epoch": 5.344827586206897, | |
| "grad_norm": 2.180207259559519, | |
| "kl": 0.044921875, | |
| "learning_rate": 4.655172413793103e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8959643840789795, | |
| "reward_std": 0.018700793385505676, | |
| "rewards/accuracy_reward": 0.8959642648696899, | |
| "rewards/format_reward": 1.0, | |
| "step": 465 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.427734375, | |
| "epoch": 5.35632183908046, | |
| "grad_norm": 1.8241902873816702, | |
| "kl": 0.0498046875, | |
| "learning_rate": 4.6436781609195404e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8787130117416382, | |
| "reward_std": 0.020657163113355637, | |
| "rewards/accuracy_reward": 0.8793638944625854, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 466 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.609375, | |
| "epoch": 5.3678160919540225, | |
| "grad_norm": 2.607546823999089, | |
| "kl": 0.04638671875, | |
| "learning_rate": 4.632183908045977e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8970588445663452, | |
| "reward_std": 0.018911032006144524, | |
| "rewards/accuracy_reward": 0.8970588445663452, | |
| "rewards/format_reward": 1.0, | |
| "step": 467 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.97526550292969, | |
| "epoch": 5.379310344827586, | |
| "grad_norm": 2.615397696144289, | |
| "kl": 0.04345703125, | |
| "learning_rate": 4.620689655172413e-07, | |
| "loss": 0.0018, | |
| "reward": 1.881915807723999, | |
| "reward_std": 0.01975308358669281, | |
| "rewards/accuracy_reward": 0.881915807723999, | |
| "rewards/format_reward": 1.0, | |
| "step": 468 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.55534362792969, | |
| "epoch": 5.390804597701149, | |
| "grad_norm": 1.9430369662373463, | |
| "kl": 0.049560546875, | |
| "learning_rate": 4.6091954022988506e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8931167125701904, | |
| "reward_std": 0.021739525720477104, | |
| "rewards/accuracy_reward": 0.8937677145004272, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 469 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.79362487792969, | |
| "epoch": 5.402298850574713, | |
| "grad_norm": 2.0918662703233295, | |
| "kl": 0.0498046875, | |
| "learning_rate": 4.597701149425287e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9036979675292969, | |
| "reward_std": 0.015338393859565258, | |
| "rewards/accuracy_reward": 0.9036981463432312, | |
| "rewards/format_reward": 1.0, | |
| "step": 470 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.98828125, | |
| "epoch": 5.413793103448276, | |
| "grad_norm": 1.6066027943146437, | |
| "kl": 0.043701171875, | |
| "learning_rate": 4.586206896551724e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9139502048492432, | |
| "reward_std": 0.01869853213429451, | |
| "rewards/accuracy_reward": 0.9139501452445984, | |
| "rewards/format_reward": 1.0, | |
| "step": 471 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.34114837646484, | |
| "epoch": 5.425287356321839, | |
| "grad_norm": 2.0098215318578716, | |
| "kl": 0.047607421875, | |
| "learning_rate": 4.574712643678161e-07, | |
| "loss": 0.002, | |
| "reward": 1.8916778564453125, | |
| "reward_std": 0.020163239911198616, | |
| "rewards/accuracy_reward": 0.8916778564453125, | |
| "rewards/format_reward": 1.0, | |
| "step": 472 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.19271087646484, | |
| "epoch": 5.436781609195402, | |
| "grad_norm": 1.9402327167023088, | |
| "kl": 0.05029296875, | |
| "learning_rate": 4.563218390804597e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8789701461791992, | |
| "reward_std": 0.023091118782758713, | |
| "rewards/accuracy_reward": 0.8796213865280151, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 473 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.625, | |
| "epoch": 5.448275862068965, | |
| "grad_norm": 1.5800586554447937, | |
| "kl": 0.04443359375, | |
| "learning_rate": 4.5517241379310346e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8999593257904053, | |
| "reward_std": 0.019150175154209137, | |
| "rewards/accuracy_reward": 0.8999593257904053, | |
| "rewards/format_reward": 1.0, | |
| "step": 474 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.41732025146484, | |
| "epoch": 5.459770114942529, | |
| "grad_norm": 1.3572231730795086, | |
| "kl": 0.048583984375, | |
| "learning_rate": 4.540229885057471e-07, | |
| "loss": 0.002, | |
| "reward": 1.9113028049468994, | |
| "reward_std": 0.018931671977043152, | |
| "rewards/accuracy_reward": 0.9113027453422546, | |
| "rewards/format_reward": 1.0, | |
| "step": 475 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.810546875, | |
| "epoch": 5.471264367816092, | |
| "grad_norm": 1.4707638373352834, | |
| "kl": 0.04931640625, | |
| "learning_rate": 4.528735632183908e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9164535999298096, | |
| "reward_std": 0.01708863489329815, | |
| "rewards/accuracy_reward": 0.91645348072052, | |
| "rewards/format_reward": 1.0, | |
| "step": 476 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.80013275146484, | |
| "epoch": 5.482758620689655, | |
| "grad_norm": 1.6157878642932322, | |
| "kl": 0.0458984375, | |
| "learning_rate": 4.5172413793103447e-07, | |
| "loss": 0.0019, | |
| "reward": 1.897383689880371, | |
| "reward_std": 0.019057631492614746, | |
| "rewards/accuracy_reward": 0.8973836302757263, | |
| "rewards/format_reward": 1.0, | |
| "step": 477 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.48372650146484, | |
| "epoch": 5.494252873563219, | |
| "grad_norm": 2.039464641047124, | |
| "kl": 0.048828125, | |
| "learning_rate": 4.505747126436781e-07, | |
| "loss": 0.002, | |
| "reward": 1.8875281810760498, | |
| "reward_std": 0.02109697088599205, | |
| "rewards/accuracy_reward": 0.8875283002853394, | |
| "rewards/format_reward": 1.0, | |
| "step": 478 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.49870300292969, | |
| "epoch": 5.505747126436781, | |
| "grad_norm": 2.039169092715578, | |
| "kl": 0.046142578125, | |
| "learning_rate": 4.494252873563218e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8861839771270752, | |
| "reward_std": 0.02034366875886917, | |
| "rewards/accuracy_reward": 0.8861838579177856, | |
| "rewards/format_reward": 1.0, | |
| "step": 479 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.89453125, | |
| "epoch": 5.517241379310345, | |
| "grad_norm": 1.4686995511345136, | |
| "kl": 0.050537109375, | |
| "learning_rate": 4.482758620689655e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9063835144042969, | |
| "reward_std": 0.018992867320775986, | |
| "rewards/accuracy_reward": 0.9070345163345337, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 480 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.40690612792969, | |
| "epoch": 5.528735632183908, | |
| "grad_norm": 2.232648278432696, | |
| "kl": 0.04296875, | |
| "learning_rate": 4.471264367816092e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8770596981048584, | |
| "reward_std": 0.02336341142654419, | |
| "rewards/accuracy_reward": 0.8770596981048584, | |
| "rewards/format_reward": 1.0, | |
| "step": 481 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.29427337646484, | |
| "epoch": 5.540229885057471, | |
| "grad_norm": 1.9103143500415654, | |
| "kl": 0.04833984375, | |
| "learning_rate": 4.4597701149425287e-07, | |
| "loss": 0.002, | |
| "reward": 1.8889049291610718, | |
| "reward_std": 0.019074462354183197, | |
| "rewards/accuracy_reward": 0.8889049887657166, | |
| "rewards/format_reward": 1.0, | |
| "step": 482 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.26171875, | |
| "epoch": 5.551724137931035, | |
| "grad_norm": 1.5192873928817636, | |
| "kl": 0.041015625, | |
| "learning_rate": 4.4482758620689656e-07, | |
| "loss": 0.0017, | |
| "reward": 1.882556438446045, | |
| "reward_std": 0.02097918465733528, | |
| "rewards/accuracy_reward": 0.8825565576553345, | |
| "rewards/format_reward": 1.0, | |
| "step": 483 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.24284362792969, | |
| "epoch": 5.563218390804598, | |
| "grad_norm": 4.569269242461156, | |
| "kl": 0.047607421875, | |
| "learning_rate": 4.436781609195402e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9005216360092163, | |
| "reward_std": 0.02364097349345684, | |
| "rewards/accuracy_reward": 0.9011726379394531, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 484 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.662109375, | |
| "epoch": 5.574712643678161, | |
| "grad_norm": 1.7930715875513572, | |
| "kl": 0.045654296875, | |
| "learning_rate": 4.425287356321839e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9116904735565186, | |
| "reward_std": 0.016724035143852234, | |
| "rewards/accuracy_reward": 0.9116904139518738, | |
| "rewards/format_reward": 1.0, | |
| "step": 485 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.36003112792969, | |
| "epoch": 5.586206896551724, | |
| "grad_norm": 4.334056364497592, | |
| "kl": 0.044921875, | |
| "learning_rate": 4.413793103448276e-07, | |
| "loss": 0.0018, | |
| "reward": 1.895902156829834, | |
| "reward_std": 0.02204793691635132, | |
| "rewards/accuracy_reward": 0.8959023356437683, | |
| "rewards/format_reward": 1.0, | |
| "step": 486 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.138671875, | |
| "epoch": 5.597701149425287, | |
| "grad_norm": 1.9027704497320623, | |
| "kl": 0.044921875, | |
| "learning_rate": 4.402298850574712e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8900482654571533, | |
| "reward_std": 0.019576409831643105, | |
| "rewards/accuracy_reward": 0.8900482654571533, | |
| "rewards/format_reward": 1.0, | |
| "step": 487 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.41471862792969, | |
| "epoch": 5.609195402298851, | |
| "grad_norm": 3.118890962601994, | |
| "kl": 0.044189453125, | |
| "learning_rate": 4.3908045977011495e-07, | |
| "loss": 0.0018, | |
| "reward": 1.890057921409607, | |
| "reward_std": 0.01878949999809265, | |
| "rewards/accuracy_reward": 0.8907088041305542, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 488 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.119140625, | |
| "epoch": 5.620689655172414, | |
| "grad_norm": 1.369384680255404, | |
| "kl": 0.045166015625, | |
| "learning_rate": 4.379310344827586e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8981773853302002, | |
| "reward_std": 0.020252332091331482, | |
| "rewards/accuracy_reward": 0.8981773853302002, | |
| "rewards/format_reward": 1.0, | |
| "step": 489 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.96028900146484, | |
| "epoch": 5.6321839080459775, | |
| "grad_norm": 1.3682700993582457, | |
| "kl": 0.04150390625, | |
| "learning_rate": 4.367816091954023e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8991467952728271, | |
| "reward_std": 0.019649513065814972, | |
| "rewards/accuracy_reward": 0.8991466760635376, | |
| "rewards/format_reward": 1.0, | |
| "step": 490 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.962890625, | |
| "epoch": 5.64367816091954, | |
| "grad_norm": 1.7998521186414984, | |
| "kl": 0.046875, | |
| "learning_rate": 4.3563218390804597e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8821454048156738, | |
| "reward_std": 0.019271574914455414, | |
| "rewards/accuracy_reward": 0.8821454048156738, | |
| "rewards/format_reward": 1.0, | |
| "step": 491 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.05208587646484, | |
| "epoch": 5.655172413793103, | |
| "grad_norm": 2.074675270916482, | |
| "kl": 0.046875, | |
| "learning_rate": 4.344827586206896e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9152133464813232, | |
| "reward_std": 0.019003426656126976, | |
| "rewards/accuracy_reward": 0.9152132868766785, | |
| "rewards/format_reward": 1.0, | |
| "step": 492 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.97396087646484, | |
| "epoch": 5.666666666666667, | |
| "grad_norm": 1.6500508152691888, | |
| "kl": 0.047607421875, | |
| "learning_rate": 4.3333333333333335e-07, | |
| "loss": 0.002, | |
| "reward": 1.9029719829559326, | |
| "reward_std": 0.019596107304096222, | |
| "rewards/accuracy_reward": 0.9029721021652222, | |
| "rewards/format_reward": 1.0, | |
| "step": 493 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.75130462646484, | |
| "epoch": 5.67816091954023, | |
| "grad_norm": 1.7635232706402635, | |
| "kl": 0.043701171875, | |
| "learning_rate": 4.32183908045977e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8977859020233154, | |
| "reward_std": 0.01936359517276287, | |
| "rewards/accuracy_reward": 0.8977858424186707, | |
| "rewards/format_reward": 1.0, | |
| "step": 494 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.8671875, | |
| "epoch": 5.689655172413794, | |
| "grad_norm": 7.5630569258900815, | |
| "kl": 0.049560546875, | |
| "learning_rate": 4.310344827586206e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8984394073486328, | |
| "reward_std": 0.020785300061106682, | |
| "rewards/accuracy_reward": 0.8984395265579224, | |
| "rewards/format_reward": 1.0, | |
| "step": 495 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.19075775146484, | |
| "epoch": 5.7011494252873565, | |
| "grad_norm": 1.9789378304791254, | |
| "kl": 0.04541015625, | |
| "learning_rate": 4.2988505747126437e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8898248672485352, | |
| "reward_std": 0.019473157823085785, | |
| "rewards/accuracy_reward": 0.8898249864578247, | |
| "rewards/format_reward": 1.0, | |
| "step": 496 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.52214050292969, | |
| "epoch": 5.712643678160919, | |
| "grad_norm": 2.230731469046121, | |
| "kl": 0.046875, | |
| "learning_rate": 4.28735632183908e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8999899625778198, | |
| "reward_std": 0.017196331173181534, | |
| "rewards/accuracy_reward": 0.8999900221824646, | |
| "rewards/format_reward": 1.0, | |
| "step": 497 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.529296875, | |
| "epoch": 5.724137931034483, | |
| "grad_norm": 1.8172179930249694, | |
| "kl": 0.04736328125, | |
| "learning_rate": 4.2758620689655174e-07, | |
| "loss": 0.002, | |
| "reward": 1.9146230220794678, | |
| "reward_std": 0.016667529940605164, | |
| "rewards/accuracy_reward": 0.9146231412887573, | |
| "rewards/format_reward": 1.0, | |
| "step": 498 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.14323425292969, | |
| "epoch": 5.735632183908046, | |
| "grad_norm": 1.7799144316476456, | |
| "kl": 0.051025390625, | |
| "learning_rate": 4.264367816091954e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8982200622558594, | |
| "reward_std": 0.01865963265299797, | |
| "rewards/accuracy_reward": 0.8982200026512146, | |
| "rewards/format_reward": 1.0, | |
| "step": 499 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.78060150146484, | |
| "epoch": 5.747126436781609, | |
| "grad_norm": 3.2056367063069513, | |
| "kl": 0.045166015625, | |
| "learning_rate": 4.25287356321839e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9110320806503296, | |
| "reward_std": 0.018477408215403557, | |
| "rewards/accuracy_reward": 0.9110321998596191, | |
| "rewards/format_reward": 1.0, | |
| "step": 500 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.09375, | |
| "epoch": 5.758620689655173, | |
| "grad_norm": 2.1278601395571775, | |
| "kl": 0.04345703125, | |
| "learning_rate": 4.2413793103448276e-07, | |
| "loss": 0.0018, | |
| "reward": 1.896863341331482, | |
| "reward_std": 0.02090194821357727, | |
| "rewards/accuracy_reward": 0.8981654047966003, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 501 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.63346862792969, | |
| "epoch": 5.7701149425287355, | |
| "grad_norm": 1.933803516191336, | |
| "kl": 0.046875, | |
| "learning_rate": 4.229885057471264e-07, | |
| "loss": 0.002, | |
| "reward": 1.8992946147918701, | |
| "reward_std": 0.01685231737792492, | |
| "rewards/accuracy_reward": 0.8992947340011597, | |
| "rewards/format_reward": 1.0, | |
| "step": 502 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.67903900146484, | |
| "epoch": 5.781609195402299, | |
| "grad_norm": 2.5798280611914204, | |
| "kl": 0.049072265625, | |
| "learning_rate": 4.218390804597701e-07, | |
| "loss": 0.002, | |
| "reward": 1.8981560468673706, | |
| "reward_std": 0.017263038083910942, | |
| "rewards/accuracy_reward": 0.8981560468673706, | |
| "rewards/format_reward": 1.0, | |
| "step": 503 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.14583587646484, | |
| "epoch": 5.793103448275862, | |
| "grad_norm": 2.076692138986448, | |
| "kl": 0.043701171875, | |
| "learning_rate": 4.206896551724138e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8787815570831299, | |
| "reward_std": 0.019007613882422447, | |
| "rewards/accuracy_reward": 0.8787816762924194, | |
| "rewards/format_reward": 1.0, | |
| "step": 504 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.9453125, | |
| "epoch": 5.804597701149425, | |
| "grad_norm": 2.062105078777337, | |
| "kl": 0.049072265625, | |
| "learning_rate": 4.195402298850574e-07, | |
| "loss": 0.002, | |
| "reward": 1.8786977529525757, | |
| "reward_std": 0.018628563731908798, | |
| "rewards/accuracy_reward": 0.8786976337432861, | |
| "rewards/format_reward": 1.0, | |
| "step": 505 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.208984375, | |
| "epoch": 5.816091954022989, | |
| "grad_norm": 3.4179007868834175, | |
| "kl": 0.04443359375, | |
| "learning_rate": 4.1839080459770116e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9090993404388428, | |
| "reward_std": 0.018269993364810944, | |
| "rewards/accuracy_reward": 0.9090994596481323, | |
| "rewards/format_reward": 1.0, | |
| "step": 506 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.859375, | |
| "epoch": 5.827586206896552, | |
| "grad_norm": 1.642778882446284, | |
| "kl": 0.04541015625, | |
| "learning_rate": 4.172413793103448e-07, | |
| "loss": 0.0019, | |
| "reward": 1.895350694656372, | |
| "reward_std": 0.017278993502259254, | |
| "rewards/accuracy_reward": 0.8960016965866089, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 507 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.00521087646484, | |
| "epoch": 5.8390804597701145, | |
| "grad_norm": 2.6806085379649702, | |
| "kl": 0.049072265625, | |
| "learning_rate": 4.160919540229885e-07, | |
| "loss": 0.002, | |
| "reward": 1.9006847143173218, | |
| "reward_std": 0.016903840005397797, | |
| "rewards/accuracy_reward": 0.9006847143173218, | |
| "rewards/format_reward": 1.0, | |
| "step": 508 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.69466400146484, | |
| "epoch": 5.850574712643678, | |
| "grad_norm": 3.6854562093571386, | |
| "kl": 0.05859375, | |
| "learning_rate": 4.149425287356322e-07, | |
| "loss": 0.0024, | |
| "reward": 1.908639907836914, | |
| "reward_std": 0.016876667737960815, | |
| "rewards/accuracy_reward": 0.9086400866508484, | |
| "rewards/format_reward": 1.0, | |
| "step": 509 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.81120300292969, | |
| "epoch": 5.862068965517241, | |
| "grad_norm": 3.453501517404399, | |
| "kl": 0.0517578125, | |
| "learning_rate": 4.1379310344827586e-07, | |
| "loss": 0.0022, | |
| "reward": 1.903754472732544, | |
| "reward_std": 0.019029080867767334, | |
| "rewards/accuracy_reward": 0.9044055342674255, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 510 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.5859375, | |
| "epoch": 5.873563218390805, | |
| "grad_norm": 1.803202256070274, | |
| "kl": 0.054443359375, | |
| "learning_rate": 4.126436781609195e-07, | |
| "loss": 0.0022, | |
| "reward": 1.8829306364059448, | |
| "reward_std": 0.01749059185385704, | |
| "rewards/accuracy_reward": 0.8829306364059448, | |
| "rewards/format_reward": 1.0, | |
| "step": 511 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.27995300292969, | |
| "epoch": 5.885057471264368, | |
| "grad_norm": 1.5556838404555604, | |
| "kl": 0.044677734375, | |
| "learning_rate": 4.114942528735632e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9034370183944702, | |
| "reward_std": 0.018126491457223892, | |
| "rewards/accuracy_reward": 0.903437077999115, | |
| "rewards/format_reward": 1.0, | |
| "step": 512 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.17057800292969, | |
| "epoch": 5.896551724137931, | |
| "grad_norm": 3.4957318575488383, | |
| "kl": 0.0498046875, | |
| "learning_rate": 4.103448275862069e-07, | |
| "loss": 0.002, | |
| "reward": 1.9079878330230713, | |
| "reward_std": 0.01815355196595192, | |
| "rewards/accuracy_reward": 0.9079879522323608, | |
| "rewards/format_reward": 1.0, | |
| "step": 513 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.59765625, | |
| "epoch": 5.908045977011494, | |
| "grad_norm": 2.707726805902382, | |
| "kl": 0.044921875, | |
| "learning_rate": 4.0919540229885057e-07, | |
| "loss": 0.0019, | |
| "reward": 1.884354591369629, | |
| "reward_std": 0.018658233806490898, | |
| "rewards/accuracy_reward": 0.8843547105789185, | |
| "rewards/format_reward": 1.0, | |
| "step": 514 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.71354675292969, | |
| "epoch": 5.919540229885057, | |
| "grad_norm": 1.4965768623048503, | |
| "kl": 0.04931640625, | |
| "learning_rate": 4.0804597701149426e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8936374187469482, | |
| "reward_std": 0.01746196486055851, | |
| "rewards/accuracy_reward": 0.8936373591423035, | |
| "rewards/format_reward": 1.0, | |
| "step": 515 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.97982025146484, | |
| "epoch": 5.931034482758621, | |
| "grad_norm": 2.9479475900202115, | |
| "kl": 0.046630859375, | |
| "learning_rate": 4.068965517241379e-07, | |
| "loss": 0.0019, | |
| "reward": 1.902561902999878, | |
| "reward_std": 0.01716381497681141, | |
| "rewards/accuracy_reward": 0.9025619029998779, | |
| "rewards/format_reward": 1.0, | |
| "step": 516 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.25456237792969, | |
| "epoch": 5.942528735632184, | |
| "grad_norm": 2.5970229292314677, | |
| "kl": 0.05029296875, | |
| "learning_rate": 4.057471264367816e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8918704986572266, | |
| "reward_std": 0.019317103549838066, | |
| "rewards/accuracy_reward": 0.8918704986572266, | |
| "rewards/format_reward": 1.0, | |
| "step": 517 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.76888275146484, | |
| "epoch": 5.954022988505747, | |
| "grad_norm": 1.9125407656929412, | |
| "kl": 0.045654296875, | |
| "learning_rate": 4.045977011494253e-07, | |
| "loss": 0.0019, | |
| "reward": 1.89088773727417, | |
| "reward_std": 0.0201788991689682, | |
| "rewards/accuracy_reward": 0.8915388584136963, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 518 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.34375, | |
| "epoch": 5.9655172413793105, | |
| "grad_norm": 1.906615415914849, | |
| "kl": 0.05322265625, | |
| "learning_rate": 4.034482758620689e-07, | |
| "loss": 0.0022, | |
| "reward": 1.8955434560775757, | |
| "reward_std": 0.017683900892734528, | |
| "rewards/accuracy_reward": 0.8955433368682861, | |
| "rewards/format_reward": 1.0, | |
| "step": 519 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.95573425292969, | |
| "epoch": 5.977011494252873, | |
| "grad_norm": 1.930378197428578, | |
| "kl": 0.052734375, | |
| "learning_rate": 4.0229885057471266e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9113073348999023, | |
| "reward_std": 0.015919553115963936, | |
| "rewards/accuracy_reward": 0.9113074541091919, | |
| "rewards/format_reward": 1.0, | |
| "step": 520 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.345703125, | |
| "epoch": 5.988505747126437, | |
| "grad_norm": 2.559459936562671, | |
| "kl": 0.048828125, | |
| "learning_rate": 4.011494252873563e-07, | |
| "loss": 0.0021, | |
| "reward": 1.903090476989746, | |
| "reward_std": 0.017953019589185715, | |
| "rewards/accuracy_reward": 0.9030904769897461, | |
| "rewards/format_reward": 1.0, | |
| "step": 521 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.29354095458984, | |
| "epoch": 6.0, | |
| "grad_norm": 1.320684949997662, | |
| "kl": 0.043212890625, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9057987928390503, | |
| "reward_std": 0.014725634828209877, | |
| "rewards/accuracy_reward": 0.9057987332344055, | |
| "rewards/format_reward": 1.0, | |
| "step": 522 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.42317962646484, | |
| "epoch": 6.011494252873563, | |
| "grad_norm": 1.785801287990398, | |
| "kl": 0.046875, | |
| "learning_rate": 3.9885057471264367e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8730857372283936, | |
| "reward_std": 0.016687028110027313, | |
| "rewards/accuracy_reward": 0.8730856776237488, | |
| "rewards/format_reward": 1.0, | |
| "step": 523 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.64778900146484, | |
| "epoch": 6.022988505747127, | |
| "grad_norm": 2.818216992605221, | |
| "kl": 0.05126953125, | |
| "learning_rate": 3.977011494252873e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8802497386932373, | |
| "reward_std": 0.019351143389940262, | |
| "rewards/accuracy_reward": 0.8802497386932373, | |
| "rewards/format_reward": 1.0, | |
| "step": 524 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.970703125, | |
| "epoch": 6.0344827586206895, | |
| "grad_norm": 1.373324008322693, | |
| "kl": 0.04833984375, | |
| "learning_rate": 3.9655172413793105e-07, | |
| "loss": 0.002, | |
| "reward": 1.8902562856674194, | |
| "reward_std": 0.017598673701286316, | |
| "rewards/accuracy_reward": 0.8902562856674194, | |
| "rewards/format_reward": 1.0, | |
| "step": 525 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.61263275146484, | |
| "epoch": 6.045977011494253, | |
| "grad_norm": 1.4203897246386383, | |
| "kl": 0.046630859375, | |
| "learning_rate": 3.954022988505747e-07, | |
| "loss": 0.002, | |
| "reward": 1.896735668182373, | |
| "reward_std": 0.0163632333278656, | |
| "rewards/accuracy_reward": 0.896735668182373, | |
| "rewards/format_reward": 1.0, | |
| "step": 526 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.630859375, | |
| "epoch": 6.057471264367816, | |
| "grad_norm": 1.9485819155646742, | |
| "kl": 0.047607421875, | |
| "learning_rate": 3.942528735632183e-07, | |
| "loss": 0.002, | |
| "reward": 1.8787517547607422, | |
| "reward_std": 0.020119938999414444, | |
| "rewards/accuracy_reward": 0.879402756690979, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 527 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.02214050292969, | |
| "epoch": 6.068965517241379, | |
| "grad_norm": 2.495312428574511, | |
| "kl": 0.050537109375, | |
| "learning_rate": 3.9310344827586207e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9046882390975952, | |
| "reward_std": 0.01929212175309658, | |
| "rewards/accuracy_reward": 0.9046882390975952, | |
| "rewards/format_reward": 1.0, | |
| "step": 528 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.466796875, | |
| "epoch": 6.080459770114943, | |
| "grad_norm": 1.8085811940047551, | |
| "kl": 0.048583984375, | |
| "learning_rate": 3.919540229885057e-07, | |
| "loss": 0.002, | |
| "reward": 1.8941335678100586, | |
| "reward_std": 0.0205199234187603, | |
| "rewards/accuracy_reward": 0.894133448600769, | |
| "rewards/format_reward": 1.0, | |
| "step": 529 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.52018737792969, | |
| "epoch": 6.091954022988506, | |
| "grad_norm": 5.365446513796388, | |
| "kl": 0.04638671875, | |
| "learning_rate": 3.9080459770114945e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9010367393493652, | |
| "reward_std": 0.017713043838739395, | |
| "rewards/accuracy_reward": 0.9010368585586548, | |
| "rewards/format_reward": 1.0, | |
| "step": 530 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.74089050292969, | |
| "epoch": 6.103448275862069, | |
| "grad_norm": 2.828191865732173, | |
| "kl": 0.049072265625, | |
| "learning_rate": 3.896551724137931e-07, | |
| "loss": 0.002, | |
| "reward": 1.8785228729248047, | |
| "reward_std": 0.02002035826444626, | |
| "rewards/accuracy_reward": 0.8785228133201599, | |
| "rewards/format_reward": 1.0, | |
| "step": 531 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.07747650146484, | |
| "epoch": 6.114942528735632, | |
| "grad_norm": 4.047622387425111, | |
| "kl": 0.04443359375, | |
| "learning_rate": 3.885057471264367e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8791890144348145, | |
| "reward_std": 0.01751401089131832, | |
| "rewards/accuracy_reward": 0.8791891932487488, | |
| "rewards/format_reward": 1.0, | |
| "step": 532 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.36263275146484, | |
| "epoch": 6.126436781609195, | |
| "grad_norm": 1.3128782551208291, | |
| "kl": 0.044189453125, | |
| "learning_rate": 3.8735632183908046e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9140952825546265, | |
| "reward_std": 0.01683102920651436, | |
| "rewards/accuracy_reward": 0.9140952825546265, | |
| "rewards/format_reward": 1.0, | |
| "step": 533 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.18815612792969, | |
| "epoch": 6.137931034482759, | |
| "grad_norm": 1.933372124731939, | |
| "kl": 0.038818359375, | |
| "learning_rate": 3.862068965517241e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8693971633911133, | |
| "reward_std": 0.016525980085134506, | |
| "rewards/accuracy_reward": 0.8693971633911133, | |
| "rewards/format_reward": 1.0, | |
| "step": 534 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.75521087646484, | |
| "epoch": 6.149425287356322, | |
| "grad_norm": 1.6333597141209881, | |
| "kl": 0.04150390625, | |
| "learning_rate": 3.850574712643678e-07, | |
| "loss": 0.0018, | |
| "reward": 1.90386962890625, | |
| "reward_std": 0.015211975201964378, | |
| "rewards/accuracy_reward": 0.9038697481155396, | |
| "rewards/format_reward": 1.0, | |
| "step": 535 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.96549987792969, | |
| "epoch": 6.160919540229885, | |
| "grad_norm": 1.2717871465323818, | |
| "kl": 0.0419921875, | |
| "learning_rate": 3.839080459770115e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8973326683044434, | |
| "reward_std": 0.01976935565471649, | |
| "rewards/accuracy_reward": 0.8979837894439697, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 536 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.41341400146484, | |
| "epoch": 6.172413793103448, | |
| "grad_norm": 2.481211895104965, | |
| "kl": 0.04248046875, | |
| "learning_rate": 3.827586206896551e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9014673233032227, | |
| "reward_std": 0.018813788890838623, | |
| "rewards/accuracy_reward": 0.9014673233032227, | |
| "rewards/format_reward": 1.0, | |
| "step": 537 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.07292175292969, | |
| "epoch": 6.183908045977011, | |
| "grad_norm": 1.7579797956745593, | |
| "kl": 0.03759765625, | |
| "learning_rate": 3.8160919540229886e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8949092626571655, | |
| "reward_std": 0.018055422231554985, | |
| "rewards/accuracy_reward": 0.8955603837966919, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 538 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.69140625, | |
| "epoch": 6.195402298850575, | |
| "grad_norm": 2.0716783964182257, | |
| "kl": 0.045166015625, | |
| "learning_rate": 3.804597701149425e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8800392150878906, | |
| "reward_std": 0.02071218751370907, | |
| "rewards/accuracy_reward": 0.8806902766227722, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 539 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.82942962646484, | |
| "epoch": 6.206896551724138, | |
| "grad_norm": 3.1245967017230347, | |
| "kl": 0.0390625, | |
| "learning_rate": 3.793103448275862e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8988111019134521, | |
| "reward_std": 0.01829328015446663, | |
| "rewards/accuracy_reward": 0.8988110423088074, | |
| "rewards/format_reward": 1.0, | |
| "step": 540 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.06901550292969, | |
| "epoch": 6.218390804597701, | |
| "grad_norm": 4.050364995136617, | |
| "kl": 0.05517578125, | |
| "learning_rate": 3.781609195402299e-07, | |
| "loss": 0.0023, | |
| "reward": 1.8841352462768555, | |
| "reward_std": 0.022105498239398003, | |
| "rewards/accuracy_reward": 0.8847863674163818, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 541 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.82747650146484, | |
| "epoch": 6.2298850574712645, | |
| "grad_norm": 1.8078253085183804, | |
| "kl": 0.038330078125, | |
| "learning_rate": 3.7701149425287357e-07, | |
| "loss": 0.0016, | |
| "reward": 1.910055160522461, | |
| "reward_std": 0.021085239946842194, | |
| "rewards/accuracy_reward": 0.9107062816619873, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 542 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.75325775146484, | |
| "epoch": 6.241379310344827, | |
| "grad_norm": 1.5902924468702986, | |
| "kl": 0.048583984375, | |
| "learning_rate": 3.758620689655172e-07, | |
| "loss": 0.002, | |
| "reward": 1.8845319747924805, | |
| "reward_std": 0.022094160318374634, | |
| "rewards/accuracy_reward": 0.8845321536064148, | |
| "rewards/format_reward": 1.0, | |
| "step": 543 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.82682800292969, | |
| "epoch": 6.252873563218391, | |
| "grad_norm": 2.0704586256752253, | |
| "kl": 0.042236328125, | |
| "learning_rate": 3.747126436781609e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8901569843292236, | |
| "reward_std": 0.020656492561101913, | |
| "rewards/accuracy_reward": 0.8901569247245789, | |
| "rewards/format_reward": 1.0, | |
| "step": 544 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.267578125, | |
| "epoch": 6.264367816091954, | |
| "grad_norm": 2.4508205899480724, | |
| "kl": 0.04248046875, | |
| "learning_rate": 3.735632183908046e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8985986709594727, | |
| "reward_std": 0.021605072543025017, | |
| "rewards/accuracy_reward": 0.8992499113082886, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 545 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.693359375, | |
| "epoch": 6.275862068965517, | |
| "grad_norm": 1.5807641157109527, | |
| "kl": 0.0439453125, | |
| "learning_rate": 3.7241379310344827e-07, | |
| "loss": 0.0018, | |
| "reward": 1.903763771057129, | |
| "reward_std": 0.020712215453386307, | |
| "rewards/accuracy_reward": 0.9037638902664185, | |
| "rewards/format_reward": 1.0, | |
| "step": 546 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.677734375, | |
| "epoch": 6.287356321839081, | |
| "grad_norm": 1.8174266396212175, | |
| "kl": 0.043212890625, | |
| "learning_rate": 3.7126436781609196e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8896994590759277, | |
| "reward_std": 0.02042810432612896, | |
| "rewards/accuracy_reward": 0.8896996378898621, | |
| "rewards/format_reward": 1.0, | |
| "step": 547 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.779296875, | |
| "epoch": 6.2988505747126435, | |
| "grad_norm": 1.3923115045110266, | |
| "kl": 0.03857421875, | |
| "learning_rate": 3.701149425287356e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8963736295700073, | |
| "reward_std": 0.021433323621749878, | |
| "rewards/accuracy_reward": 0.8963736295700073, | |
| "rewards/format_reward": 1.0, | |
| "step": 548 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.72526550292969, | |
| "epoch": 6.310344827586207, | |
| "grad_norm": 1.7072054434768456, | |
| "kl": 0.041748046875, | |
| "learning_rate": 3.689655172413793e-07, | |
| "loss": 0.0017, | |
| "reward": 1.91050124168396, | |
| "reward_std": 0.02200714498758316, | |
| "rewards/accuracy_reward": 0.9111522436141968, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 549 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.98567962646484, | |
| "epoch": 6.32183908045977, | |
| "grad_norm": 1.3415596828035086, | |
| "kl": 0.0390625, | |
| "learning_rate": 3.67816091954023e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9038093090057373, | |
| "reward_std": 0.01953584887087345, | |
| "rewards/accuracy_reward": 0.9044603109359741, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 550 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.85482025146484, | |
| "epoch": 6.333333333333333, | |
| "grad_norm": 1.2624977576240533, | |
| "kl": 0.039306640625, | |
| "learning_rate": 3.666666666666666e-07, | |
| "loss": 0.0017, | |
| "reward": 1.915178656578064, | |
| "reward_std": 0.017109807580709457, | |
| "rewards/accuracy_reward": 0.9151787161827087, | |
| "rewards/format_reward": 1.0, | |
| "step": 551 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.31185150146484, | |
| "epoch": 6.344827586206897, | |
| "grad_norm": 1.7459891843430122, | |
| "kl": 0.0400390625, | |
| "learning_rate": 3.6551724137931036e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8864727020263672, | |
| "reward_std": 0.02323821559548378, | |
| "rewards/accuracy_reward": 0.8871237635612488, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 552 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.73893737792969, | |
| "epoch": 6.35632183908046, | |
| "grad_norm": 2.027513640363097, | |
| "kl": 0.042724609375, | |
| "learning_rate": 3.64367816091954e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9012677669525146, | |
| "reward_std": 0.01830562949180603, | |
| "rewards/accuracy_reward": 0.9012677073478699, | |
| "rewards/format_reward": 1.0, | |
| "step": 553 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.47200775146484, | |
| "epoch": 6.3678160919540225, | |
| "grad_norm": 3.2725385424225064, | |
| "kl": 0.0419921875, | |
| "learning_rate": 3.632183908045977e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9009044170379639, | |
| "reward_std": 0.019121093675494194, | |
| "rewards/accuracy_reward": 0.9009043574333191, | |
| "rewards/format_reward": 1.0, | |
| "step": 554 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.43034362792969, | |
| "epoch": 6.379310344827586, | |
| "grad_norm": 2.1062592964548665, | |
| "kl": 0.0419921875, | |
| "learning_rate": 3.620689655172414e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9104199409484863, | |
| "reward_std": 0.020279204472899437, | |
| "rewards/accuracy_reward": 0.9110710024833679, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 555 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.58919525146484, | |
| "epoch": 6.390804597701149, | |
| "grad_norm": 1.4371380643185012, | |
| "kl": 0.04150390625, | |
| "learning_rate": 3.60919540229885e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9064548015594482, | |
| "reward_std": 0.019678324460983276, | |
| "rewards/accuracy_reward": 0.9077568054199219, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 556 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.373046875, | |
| "epoch": 6.402298850574713, | |
| "grad_norm": 1.4862995937435215, | |
| "kl": 0.041259765625, | |
| "learning_rate": 3.5977011494252875e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9054267406463623, | |
| "reward_std": 0.02058161422610283, | |
| "rewards/accuracy_reward": 0.9054265022277832, | |
| "rewards/format_reward": 1.0, | |
| "step": 557 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.744140625, | |
| "epoch": 6.413793103448276, | |
| "grad_norm": 1.4955532746736115, | |
| "kl": 0.04052734375, | |
| "learning_rate": 3.586206896551724e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9132366180419922, | |
| "reward_std": 0.0166233628988266, | |
| "rewards/accuracy_reward": 0.9132366180419922, | |
| "rewards/format_reward": 1.0, | |
| "step": 558 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.3828125, | |
| "epoch": 6.425287356321839, | |
| "grad_norm": 1.5190626491993644, | |
| "kl": 0.038818359375, | |
| "learning_rate": 3.5747126436781603e-07, | |
| "loss": 0.0016, | |
| "reward": 1.9011564254760742, | |
| "reward_std": 0.018908878788352013, | |
| "rewards/accuracy_reward": 0.9018074870109558, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 559 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.10612487792969, | |
| "epoch": 6.436781609195402, | |
| "grad_norm": 2.3464151581290253, | |
| "kl": 0.041259765625, | |
| "learning_rate": 3.5632183908045977e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8979761600494385, | |
| "reward_std": 0.019787931814789772, | |
| "rewards/accuracy_reward": 0.8986272811889648, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 560 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.68685150146484, | |
| "epoch": 6.448275862068965, | |
| "grad_norm": 5.8754331087951925, | |
| "kl": 0.04443359375, | |
| "learning_rate": 3.551724137931034e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9075603485107422, | |
| "reward_std": 0.018795132637023926, | |
| "rewards/accuracy_reward": 0.9075603485107422, | |
| "rewards/format_reward": 1.0, | |
| "step": 561 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.45833587646484, | |
| "epoch": 6.459770114942529, | |
| "grad_norm": 1.995883232305232, | |
| "kl": 0.04296875, | |
| "learning_rate": 3.5402298850574715e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9002068042755127, | |
| "reward_std": 0.019703200086951256, | |
| "rewards/accuracy_reward": 0.9002067446708679, | |
| "rewards/format_reward": 1.0, | |
| "step": 562 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.5234375, | |
| "epoch": 6.471264367816092, | |
| "grad_norm": 2.589878830540706, | |
| "kl": 0.044189453125, | |
| "learning_rate": 3.528735632183908e-07, | |
| "loss": 0.0018, | |
| "reward": 1.898942470550537, | |
| "reward_std": 0.023017754778265953, | |
| "rewards/accuracy_reward": 0.9008955955505371, | |
| "rewards/format_reward": 0.998046875, | |
| "step": 563 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.181640625, | |
| "epoch": 6.482758620689655, | |
| "grad_norm": 2.327789501169785, | |
| "kl": 0.044677734375, | |
| "learning_rate": 3.517241379310344e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9264135360717773, | |
| "reward_std": 0.018227433785796165, | |
| "rewards/accuracy_reward": 0.9264135360717773, | |
| "rewards/format_reward": 1.0, | |
| "step": 564 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.37435150146484, | |
| "epoch": 6.494252873563219, | |
| "grad_norm": 3.3012610405965925, | |
| "kl": 0.044677734375, | |
| "learning_rate": 3.5057471264367817e-07, | |
| "loss": 0.0019, | |
| "reward": 1.882051944732666, | |
| "reward_std": 0.020881911739706993, | |
| "rewards/accuracy_reward": 0.882051944732666, | |
| "rewards/format_reward": 1.0, | |
| "step": 565 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.64388275146484, | |
| "epoch": 6.505747126436781, | |
| "grad_norm": 1.5695423368611925, | |
| "kl": 0.039306640625, | |
| "learning_rate": 3.494252873563218e-07, | |
| "loss": 0.0016, | |
| "reward": 1.9002180099487305, | |
| "reward_std": 0.016297608613967896, | |
| "rewards/accuracy_reward": 0.9002181887626648, | |
| "rewards/format_reward": 1.0, | |
| "step": 566 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.037109375, | |
| "epoch": 6.517241379310345, | |
| "grad_norm": 1.3087146389425561, | |
| "kl": 0.04248046875, | |
| "learning_rate": 3.482758620689655e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9085086584091187, | |
| "reward_std": 0.017730899155139923, | |
| "rewards/accuracy_reward": 0.9085086584091187, | |
| "rewards/format_reward": 1.0, | |
| "step": 567 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.88607025146484, | |
| "epoch": 6.528735632183908, | |
| "grad_norm": 2.971298257475111, | |
| "kl": 0.046142578125, | |
| "learning_rate": 3.471264367816092e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8897184133529663, | |
| "reward_std": 0.022931650280952454, | |
| "rewards/accuracy_reward": 0.8910205364227295, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 568 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.04362487792969, | |
| "epoch": 6.540229885057471, | |
| "grad_norm": 2.2629176129306514, | |
| "kl": 0.04833984375, | |
| "learning_rate": 3.4597701149425287e-07, | |
| "loss": 0.002, | |
| "reward": 1.888806700706482, | |
| "reward_std": 0.022193504497408867, | |
| "rewards/accuracy_reward": 0.8901089429855347, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 569 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.546875, | |
| "epoch": 6.551724137931035, | |
| "grad_norm": 1.6844269731795065, | |
| "kl": 0.0458984375, | |
| "learning_rate": 3.4482758620689656e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8828990459442139, | |
| "reward_std": 0.018883569166064262, | |
| "rewards/accuracy_reward": 0.8835498690605164, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 570 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.8671875, | |
| "epoch": 6.563218390804598, | |
| "grad_norm": 2.284457618723532, | |
| "kl": 0.044921875, | |
| "learning_rate": 3.436781609195402e-07, | |
| "loss": 0.0018, | |
| "reward": 1.884222388267517, | |
| "reward_std": 0.018331531435251236, | |
| "rewards/accuracy_reward": 0.8848733305931091, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 571 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.43099212646484, | |
| "epoch": 6.574712643678161, | |
| "grad_norm": 1.446359862116147, | |
| "kl": 0.04345703125, | |
| "learning_rate": 3.425287356321839e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9134483337402344, | |
| "reward_std": 0.019138170406222343, | |
| "rewards/accuracy_reward": 0.9134482741355896, | |
| "rewards/format_reward": 1.0, | |
| "step": 572 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.00130462646484, | |
| "epoch": 6.586206896551724, | |
| "grad_norm": 5.703380489491454, | |
| "kl": 0.05078125, | |
| "learning_rate": 3.413793103448276e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8881677389144897, | |
| "reward_std": 0.018784310668706894, | |
| "rewards/accuracy_reward": 0.8881677389144897, | |
| "rewards/format_reward": 1.0, | |
| "step": 573 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.427734375, | |
| "epoch": 6.597701149425287, | |
| "grad_norm": 3.5020194713310895, | |
| "kl": 0.04345703125, | |
| "learning_rate": 3.4022988505747127e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8986886739730835, | |
| "reward_std": 0.01682961732149124, | |
| "rewards/accuracy_reward": 0.8986887335777283, | |
| "rewards/format_reward": 1.0, | |
| "step": 574 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.43424987792969, | |
| "epoch": 6.609195402298851, | |
| "grad_norm": 2.303566130374915, | |
| "kl": 0.04296875, | |
| "learning_rate": 3.390804597701149e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8962440490722656, | |
| "reward_std": 0.02106454037129879, | |
| "rewards/accuracy_reward": 0.8975462913513184, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 575 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.39583587646484, | |
| "epoch": 6.620689655172414, | |
| "grad_norm": 1.6584764881239733, | |
| "kl": 0.045166015625, | |
| "learning_rate": 3.379310344827586e-07, | |
| "loss": 0.0019, | |
| "reward": 1.88081693649292, | |
| "reward_std": 0.018141578882932663, | |
| "rewards/accuracy_reward": 0.8808168768882751, | |
| "rewards/format_reward": 1.0, | |
| "step": 576 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.30143737792969, | |
| "epoch": 6.6321839080459775, | |
| "grad_norm": 4.350037902113492, | |
| "kl": 0.048828125, | |
| "learning_rate": 3.367816091954023e-07, | |
| "loss": 0.002, | |
| "reward": 1.8807071447372437, | |
| "reward_std": 0.019819162786006927, | |
| "rewards/accuracy_reward": 0.8807072043418884, | |
| "rewards/format_reward": 1.0, | |
| "step": 577 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.64453125, | |
| "epoch": 6.64367816091954, | |
| "grad_norm": 1.8183763915247184, | |
| "kl": 0.04736328125, | |
| "learning_rate": 3.35632183908046e-07, | |
| "loss": 0.002, | |
| "reward": 1.8958330154418945, | |
| "reward_std": 0.017030756920576096, | |
| "rewards/accuracy_reward": 0.8958329558372498, | |
| "rewards/format_reward": 1.0, | |
| "step": 578 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.712890625, | |
| "epoch": 6.655172413793103, | |
| "grad_norm": 1.6800458122792343, | |
| "kl": 0.045166015625, | |
| "learning_rate": 3.3448275862068966e-07, | |
| "loss": 0.0019, | |
| "reward": 1.899322748184204, | |
| "reward_std": 0.01855292171239853, | |
| "rewards/accuracy_reward": 0.8993227481842041, | |
| "rewards/format_reward": 1.0, | |
| "step": 579 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.41862487792969, | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 1.5389866031610586, | |
| "kl": 0.048828125, | |
| "learning_rate": 3.333333333333333e-07, | |
| "loss": 0.002, | |
| "reward": 1.9041564464569092, | |
| "reward_std": 0.019357208162546158, | |
| "rewards/accuracy_reward": 0.9041563868522644, | |
| "rewards/format_reward": 1.0, | |
| "step": 580 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.85482025146484, | |
| "epoch": 6.67816091954023, | |
| "grad_norm": 1.854004391136165, | |
| "kl": 0.046630859375, | |
| "learning_rate": 3.32183908045977e-07, | |
| "loss": 0.0019, | |
| "reward": 1.896866798400879, | |
| "reward_std": 0.02253865823149681, | |
| "rewards/accuracy_reward": 0.8981690406799316, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 581 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.93359375, | |
| "epoch": 6.689655172413794, | |
| "grad_norm": 2.018641316913995, | |
| "kl": 0.048828125, | |
| "learning_rate": 3.310344827586207e-07, | |
| "loss": 0.002, | |
| "reward": 1.9180282354354858, | |
| "reward_std": 0.01753108948469162, | |
| "rewards/accuracy_reward": 0.9180280566215515, | |
| "rewards/format_reward": 1.0, | |
| "step": 582 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.14192962646484, | |
| "epoch": 6.7011494252873565, | |
| "grad_norm": 2.590727908860308, | |
| "kl": 0.047119140625, | |
| "learning_rate": 3.298850574712643e-07, | |
| "loss": 0.002, | |
| "reward": 1.8914936780929565, | |
| "reward_std": 0.019188500940799713, | |
| "rewards/accuracy_reward": 0.8921446800231934, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 583 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.40625, | |
| "epoch": 6.712643678160919, | |
| "grad_norm": 1.4866576141042993, | |
| "kl": 0.05029296875, | |
| "learning_rate": 3.2873563218390806e-07, | |
| "loss": 0.0021, | |
| "reward": 1.900843620300293, | |
| "reward_std": 0.02220628783106804, | |
| "rewards/accuracy_reward": 0.9014946818351746, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 584 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.1640625, | |
| "epoch": 6.724137931034483, | |
| "grad_norm": 1.7834180698321564, | |
| "kl": 0.049560546875, | |
| "learning_rate": 3.275862068965517e-07, | |
| "loss": 0.002, | |
| "reward": 1.8919929265975952, | |
| "reward_std": 0.019489863887429237, | |
| "rewards/accuracy_reward": 0.8919928073883057, | |
| "rewards/format_reward": 1.0, | |
| "step": 585 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.11263275146484, | |
| "epoch": 6.735632183908046, | |
| "grad_norm": 2.080295738757342, | |
| "kl": 0.0517578125, | |
| "learning_rate": 3.264367816091954e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9068288803100586, | |
| "reward_std": 0.01798483356833458, | |
| "rewards/accuracy_reward": 0.9068288803100586, | |
| "rewards/format_reward": 1.0, | |
| "step": 586 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.28515625, | |
| "epoch": 6.747126436781609, | |
| "grad_norm": 1.363971268029249, | |
| "kl": 0.05322265625, | |
| "learning_rate": 3.252873563218391e-07, | |
| "loss": 0.0022, | |
| "reward": 1.896977186203003, | |
| "reward_std": 0.01630130037665367, | |
| "rewards/accuracy_reward": 0.8969771265983582, | |
| "rewards/format_reward": 1.0, | |
| "step": 587 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.81120300292969, | |
| "epoch": 6.758620689655173, | |
| "grad_norm": 1.8998231610338354, | |
| "kl": 0.04541015625, | |
| "learning_rate": 3.241379310344827e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8929922580718994, | |
| "reward_std": 0.018269825726747513, | |
| "rewards/accuracy_reward": 0.8929921984672546, | |
| "rewards/format_reward": 1.0, | |
| "step": 588 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.96810150146484, | |
| "epoch": 6.7701149425287355, | |
| "grad_norm": 1.4654423135239003, | |
| "kl": 0.055908203125, | |
| "learning_rate": 3.2298850574712646e-07, | |
| "loss": 0.0023, | |
| "reward": 1.9026551246643066, | |
| "reward_std": 0.018198613077402115, | |
| "rewards/accuracy_reward": 0.9026551246643066, | |
| "rewards/format_reward": 1.0, | |
| "step": 589 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.50521087646484, | |
| "epoch": 6.781609195402299, | |
| "grad_norm": 1.720473724542019, | |
| "kl": 0.05029296875, | |
| "learning_rate": 3.218390804597701e-07, | |
| "loss": 0.0021, | |
| "reward": 1.896925687789917, | |
| "reward_std": 0.016040321439504623, | |
| "rewards/accuracy_reward": 0.8969255685806274, | |
| "rewards/format_reward": 1.0, | |
| "step": 590 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.015625, | |
| "epoch": 6.793103448275862, | |
| "grad_norm": 1.8599065609753198, | |
| "kl": 0.046875, | |
| "learning_rate": 3.2068965517241373e-07, | |
| "loss": 0.002, | |
| "reward": 1.896074652671814, | |
| "reward_std": 0.021062329411506653, | |
| "rewards/accuracy_reward": 0.898027777671814, | |
| "rewards/format_reward": 0.998046875, | |
| "step": 591 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.74870300292969, | |
| "epoch": 6.804597701149425, | |
| "grad_norm": 2.454880208710065, | |
| "kl": 0.04541015625, | |
| "learning_rate": 3.1954022988505747e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9107999801635742, | |
| "reward_std": 0.015166133642196655, | |
| "rewards/accuracy_reward": 0.9108001589775085, | |
| "rewards/format_reward": 1.0, | |
| "step": 592 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.056640625, | |
| "epoch": 6.816091954022989, | |
| "grad_norm": 1.7036855487787057, | |
| "kl": 0.045166015625, | |
| "learning_rate": 3.183908045977011e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8942689895629883, | |
| "reward_std": 0.02249247394502163, | |
| "rewards/accuracy_reward": 0.8949202299118042, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 593 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.830078125, | |
| "epoch": 6.827586206896552, | |
| "grad_norm": 53.62639568378942, | |
| "kl": 0.0458984375, | |
| "learning_rate": 3.1724137931034485e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8942097425460815, | |
| "reward_std": 0.019950736314058304, | |
| "rewards/accuracy_reward": 0.8948608636856079, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 594 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.10286712646484, | |
| "epoch": 6.8390804597701145, | |
| "grad_norm": 5.944879218709257, | |
| "kl": 0.040283203125, | |
| "learning_rate": 3.160919540229885e-07, | |
| "loss": 0.0017, | |
| "reward": 1.89532470703125, | |
| "reward_std": 0.016749953851103783, | |
| "rewards/accuracy_reward": 0.89532470703125, | |
| "rewards/format_reward": 1.0, | |
| "step": 595 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.40560150146484, | |
| "epoch": 6.850574712643678, | |
| "grad_norm": 1.4735604817931964, | |
| "kl": 0.045654296875, | |
| "learning_rate": 3.149425287356321e-07, | |
| "loss": 0.002, | |
| "reward": 1.9018571376800537, | |
| "reward_std": 0.01927165314555168, | |
| "rewards/accuracy_reward": 0.9018572568893433, | |
| "rewards/format_reward": 1.0, | |
| "step": 596 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.650390625, | |
| "epoch": 6.862068965517241, | |
| "grad_norm": 2.3771678349296113, | |
| "kl": 0.04638671875, | |
| "learning_rate": 3.1379310344827587e-07, | |
| "loss": 0.0019, | |
| "reward": 1.902691125869751, | |
| "reward_std": 0.019225675612688065, | |
| "rewards/accuracy_reward": 0.9033421277999878, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 597 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.69661712646484, | |
| "epoch": 6.873563218390805, | |
| "grad_norm": 1.3517485796795934, | |
| "kl": 0.041015625, | |
| "learning_rate": 3.126436781609195e-07, | |
| "loss": 0.0017, | |
| "reward": 1.904984712600708, | |
| "reward_std": 0.01752365753054619, | |
| "rewards/accuracy_reward": 0.9049846529960632, | |
| "rewards/format_reward": 1.0, | |
| "step": 598 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.94792175292969, | |
| "epoch": 6.885057471264368, | |
| "grad_norm": 1.7049030907812799, | |
| "kl": 0.046630859375, | |
| "learning_rate": 3.114942528735632e-07, | |
| "loss": 0.002, | |
| "reward": 1.8980730772018433, | |
| "reward_std": 0.016529429703950882, | |
| "rewards/accuracy_reward": 0.8980730772018433, | |
| "rewards/format_reward": 1.0, | |
| "step": 599 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.34700775146484, | |
| "epoch": 6.896551724137931, | |
| "grad_norm": 1.8048662278319052, | |
| "kl": 0.04638671875, | |
| "learning_rate": 3.103448275862069e-07, | |
| "loss": 0.002, | |
| "reward": 1.9081945419311523, | |
| "reward_std": 0.01681654527783394, | |
| "rewards/accuracy_reward": 0.9081945419311523, | |
| "rewards/format_reward": 1.0, | |
| "step": 600 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.6796875, | |
| "epoch": 6.908045977011494, | |
| "grad_norm": 1.9132885179118997, | |
| "kl": 0.04736328125, | |
| "learning_rate": 3.091954022988506e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8925232887268066, | |
| "reward_std": 0.020483635365962982, | |
| "rewards/accuracy_reward": 0.8931743502616882, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 601 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.28841400146484, | |
| "epoch": 6.919540229885057, | |
| "grad_norm": 2.831161118788208, | |
| "kl": 0.04541015625, | |
| "learning_rate": 3.0804597701149426e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8872265815734863, | |
| "reward_std": 0.01937289349734783, | |
| "rewards/accuracy_reward": 0.8872265815734863, | |
| "rewards/format_reward": 1.0, | |
| "step": 602 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.193359375, | |
| "epoch": 6.931034482758621, | |
| "grad_norm": 1.9008533326371801, | |
| "kl": 0.046875, | |
| "learning_rate": 3.068965517241379e-07, | |
| "loss": 0.002, | |
| "reward": 1.9161338806152344, | |
| "reward_std": 0.017956051975488663, | |
| "rewards/accuracy_reward": 0.9161338806152344, | |
| "rewards/format_reward": 1.0, | |
| "step": 603 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.76432800292969, | |
| "epoch": 6.942528735632184, | |
| "grad_norm": 1.9049955094649353, | |
| "kl": 0.04541015625, | |
| "learning_rate": 3.057471264367816e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8926061391830444, | |
| "reward_std": 0.019360072910785675, | |
| "rewards/accuracy_reward": 0.8932573199272156, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 604 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.96745300292969, | |
| "epoch": 6.954022988505747, | |
| "grad_norm": 1.275994764733278, | |
| "kl": 0.0458984375, | |
| "learning_rate": 3.045977011494253e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9014930725097656, | |
| "reward_std": 0.017896311357617378, | |
| "rewards/accuracy_reward": 0.9014931917190552, | |
| "rewards/format_reward": 1.0, | |
| "step": 605 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.83528900146484, | |
| "epoch": 6.9655172413793105, | |
| "grad_norm": 1.6892176163979111, | |
| "kl": 0.044677734375, | |
| "learning_rate": 3.0344827586206897e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8952045440673828, | |
| "reward_std": 0.01996638998389244, | |
| "rewards/accuracy_reward": 0.895204484462738, | |
| "rewards/format_reward": 1.0, | |
| "step": 606 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.58659362792969, | |
| "epoch": 6.977011494252873, | |
| "grad_norm": 1.3212776547555032, | |
| "kl": 0.04541015625, | |
| "learning_rate": 3.022988505747126e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9071601629257202, | |
| "reward_std": 0.020716873928904533, | |
| "rewards/accuracy_reward": 0.907811164855957, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 607 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.09505462646484, | |
| "epoch": 6.988505747126437, | |
| "grad_norm": 1.3698382830038993, | |
| "kl": 0.046142578125, | |
| "learning_rate": 3.011494252873563e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8822848796844482, | |
| "reward_std": 0.02124325931072235, | |
| "rewards/accuracy_reward": 0.8822848200798035, | |
| "rewards/format_reward": 1.0, | |
| "step": 608 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.09129333496094, | |
| "epoch": 7.0, | |
| "grad_norm": 1.611157838283439, | |
| "kl": 0.04150390625, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8909047842025757, | |
| "reward_std": 0.01773572526872158, | |
| "rewards/accuracy_reward": 0.8909049034118652, | |
| "rewards/format_reward": 1.0, | |
| "step": 609 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.59440612792969, | |
| "epoch": 7.011494252873563, | |
| "grad_norm": 2.0128308488525946, | |
| "kl": 0.04345703125, | |
| "learning_rate": 2.988505747126437e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8733458518981934, | |
| "reward_std": 0.021298928186297417, | |
| "rewards/accuracy_reward": 0.873996913433075, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 610 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.76628112792969, | |
| "epoch": 7.022988505747127, | |
| "grad_norm": 1.7741703706980398, | |
| "kl": 0.04296875, | |
| "learning_rate": 2.9770114942528737e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8772882223129272, | |
| "reward_std": 0.02053675800561905, | |
| "rewards/accuracy_reward": 0.8772882223129272, | |
| "rewards/format_reward": 1.0, | |
| "step": 611 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.43229675292969, | |
| "epoch": 7.0344827586206895, | |
| "grad_norm": 2.61099721986832, | |
| "kl": 0.042724609375, | |
| "learning_rate": 2.96551724137931e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8760509490966797, | |
| "reward_std": 0.023378584533929825, | |
| "rewards/accuracy_reward": 0.8767021894454956, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 612 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.71745300292969, | |
| "epoch": 7.045977011494253, | |
| "grad_norm": 1.681876205953948, | |
| "kl": 0.043212890625, | |
| "learning_rate": 2.954022988505747e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9030117988586426, | |
| "reward_std": 0.019194534048438072, | |
| "rewards/accuracy_reward": 0.9030117988586426, | |
| "rewards/format_reward": 1.0, | |
| "step": 613 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.92253112792969, | |
| "epoch": 7.057471264367816, | |
| "grad_norm": 1.401507959053581, | |
| "kl": 0.042236328125, | |
| "learning_rate": 2.942528735632184e-07, | |
| "loss": 0.0018, | |
| "reward": 1.886817216873169, | |
| "reward_std": 0.0201483853161335, | |
| "rewards/accuracy_reward": 0.8868171572685242, | |
| "rewards/format_reward": 1.0, | |
| "step": 614 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.21224212646484, | |
| "epoch": 7.068965517241379, | |
| "grad_norm": 2.3290159563411725, | |
| "kl": 0.046142578125, | |
| "learning_rate": 2.93103448275862e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9030017852783203, | |
| "reward_std": 0.021339600905776024, | |
| "rewards/accuracy_reward": 0.9036529064178467, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 615 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.49089050292969, | |
| "epoch": 7.080459770114943, | |
| "grad_norm": 2.406758547267441, | |
| "kl": 0.044921875, | |
| "learning_rate": 2.9195402298850576e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8917362689971924, | |
| "reward_std": 0.02221861481666565, | |
| "rewards/accuracy_reward": 0.892387330532074, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 616 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.44921875, | |
| "epoch": 7.091954022988506, | |
| "grad_norm": 2.4685783634645224, | |
| "kl": 0.047607421875, | |
| "learning_rate": 2.908045977011494e-07, | |
| "loss": 0.002, | |
| "reward": 1.8855441808700562, | |
| "reward_std": 0.022871162742376328, | |
| "rewards/accuracy_reward": 0.8855441808700562, | |
| "rewards/format_reward": 1.0, | |
| "step": 617 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.88021087646484, | |
| "epoch": 7.103448275862069, | |
| "grad_norm": 6.626736993198597, | |
| "kl": 0.047119140625, | |
| "learning_rate": 2.896551724137931e-07, | |
| "loss": 0.002, | |
| "reward": 1.8934516906738281, | |
| "reward_std": 0.021955883130431175, | |
| "rewards/accuracy_reward": 0.8934516906738281, | |
| "rewards/format_reward": 1.0, | |
| "step": 618 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.73698425292969, | |
| "epoch": 7.114942528735632, | |
| "grad_norm": 1.4755499555332456, | |
| "kl": 0.043212890625, | |
| "learning_rate": 2.885057471264368e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8936246633529663, | |
| "reward_std": 0.020434757694602013, | |
| "rewards/accuracy_reward": 0.8936247229576111, | |
| "rewards/format_reward": 1.0, | |
| "step": 619 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.470703125, | |
| "epoch": 7.126436781609195, | |
| "grad_norm": 2.394219832051532, | |
| "kl": 0.04541015625, | |
| "learning_rate": 2.873563218390804e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9143403768539429, | |
| "reward_std": 0.020542293787002563, | |
| "rewards/accuracy_reward": 0.9149913787841797, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 620 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.93815612792969, | |
| "epoch": 7.137931034482759, | |
| "grad_norm": 1.5127458134308953, | |
| "kl": 0.042724609375, | |
| "learning_rate": 2.8620689655172416e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8820905685424805, | |
| "reward_std": 0.0194623414427042, | |
| "rewards/accuracy_reward": 0.8827415704727173, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 621 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.171875, | |
| "epoch": 7.149425287356322, | |
| "grad_norm": 1.239872088979501, | |
| "kl": 0.04296875, | |
| "learning_rate": 2.850574712643678e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8986458778381348, | |
| "reward_std": 0.020148858428001404, | |
| "rewards/accuracy_reward": 0.8986459970474243, | |
| "rewards/format_reward": 1.0, | |
| "step": 622 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.97265625, | |
| "epoch": 7.160919540229885, | |
| "grad_norm": 2.458759960422581, | |
| "kl": 0.04248046875, | |
| "learning_rate": 2.8390804597701143e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9095439910888672, | |
| "reward_std": 0.018151775002479553, | |
| "rewards/accuracy_reward": 0.9095439314842224, | |
| "rewards/format_reward": 1.0, | |
| "step": 623 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.96484375, | |
| "epoch": 7.172413793103448, | |
| "grad_norm": 1.7316074752480677, | |
| "kl": 0.0439453125, | |
| "learning_rate": 2.827586206896552e-07, | |
| "loss": 0.0018, | |
| "reward": 1.916879653930664, | |
| "reward_std": 0.016119133681058884, | |
| "rewards/accuracy_reward": 0.9168797731399536, | |
| "rewards/format_reward": 1.0, | |
| "step": 624 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.93685150146484, | |
| "epoch": 7.183908045977011, | |
| "grad_norm": 7.6966596331183545, | |
| "kl": 0.0439453125, | |
| "learning_rate": 2.816091954022988e-07, | |
| "loss": 0.0018, | |
| "reward": 1.904394507408142, | |
| "reward_std": 0.01645834557712078, | |
| "rewards/accuracy_reward": 0.9043946266174316, | |
| "rewards/format_reward": 1.0, | |
| "step": 625 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.34310150146484, | |
| "epoch": 7.195402298850575, | |
| "grad_norm": 3.0027221313342034, | |
| "kl": 0.0419921875, | |
| "learning_rate": 2.8045977011494255e-07, | |
| "loss": 0.0017, | |
| "reward": 1.899554967880249, | |
| "reward_std": 0.022434815764427185, | |
| "rewards/accuracy_reward": 0.9002060294151306, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 626 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.80989837646484, | |
| "epoch": 7.206896551724138, | |
| "grad_norm": 1.6600118557647507, | |
| "kl": 0.0400390625, | |
| "learning_rate": 2.793103448275862e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8966201543807983, | |
| "reward_std": 0.019420992583036423, | |
| "rewards/accuracy_reward": 0.8972713351249695, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 627 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.37174987792969, | |
| "epoch": 7.218390804597701, | |
| "grad_norm": 1.710315303436248, | |
| "kl": 0.046875, | |
| "learning_rate": 2.781609195402299e-07, | |
| "loss": 0.002, | |
| "reward": 1.9059438705444336, | |
| "reward_std": 0.01961967721581459, | |
| "rewards/accuracy_reward": 0.9059439897537231, | |
| "rewards/format_reward": 1.0, | |
| "step": 628 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.951171875, | |
| "epoch": 7.2298850574712645, | |
| "grad_norm": 1.3756891644716789, | |
| "kl": 0.041748046875, | |
| "learning_rate": 2.7701149425287357e-07, | |
| "loss": 0.0017, | |
| "reward": 1.888430118560791, | |
| "reward_std": 0.0190572552382946, | |
| "rewards/accuracy_reward": 0.8890811800956726, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 629 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.404296875, | |
| "epoch": 7.241379310344827, | |
| "grad_norm": 3970437544517841.0, | |
| "kl": 11751030521856.0, | |
| "learning_rate": 2.758620689655172e-07, | |
| "loss": 470864691200.0, | |
| "reward": 1.9054559469223022, | |
| "reward_std": 0.020271051675081253, | |
| "rewards/accuracy_reward": 0.9067580103874207, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 630 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.57747650146484, | |
| "epoch": 7.252873563218391, | |
| "grad_norm": 1.8817887294402726, | |
| "kl": 0.04541015625, | |
| "learning_rate": 2.747126436781609e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8884859085083008, | |
| "reward_std": 0.018849171698093414, | |
| "rewards/accuracy_reward": 0.8884860277175903, | |
| "rewards/format_reward": 1.0, | |
| "step": 631 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.96875, | |
| "epoch": 7.264367816091954, | |
| "grad_norm": 1.7719505691506174, | |
| "kl": 0.046142578125, | |
| "learning_rate": 2.735632183908046e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9029910564422607, | |
| "reward_std": 0.021298250183463097, | |
| "rewards/accuracy_reward": 0.9036421179771423, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 632 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.53841400146484, | |
| "epoch": 7.275862068965517, | |
| "grad_norm": 2.449085093736059, | |
| "kl": 0.05078125, | |
| "learning_rate": 2.724137931034483e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8815187215805054, | |
| "reward_std": 0.018999949097633362, | |
| "rewards/accuracy_reward": 0.8815188407897949, | |
| "rewards/format_reward": 1.0, | |
| "step": 633 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.56640625, | |
| "epoch": 7.287356321839081, | |
| "grad_norm": 2.98961076193299, | |
| "kl": 0.044677734375, | |
| "learning_rate": 2.7126436781609197e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9084595441818237, | |
| "reward_std": 0.0186910443007946, | |
| "rewards/accuracy_reward": 0.9084595441818237, | |
| "rewards/format_reward": 1.0, | |
| "step": 634 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.57357025146484, | |
| "epoch": 7.2988505747126435, | |
| "grad_norm": 2.134961142900107, | |
| "kl": 0.04736328125, | |
| "learning_rate": 2.701149425287356e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9063668251037598, | |
| "reward_std": 0.016174225136637688, | |
| "rewards/accuracy_reward": 0.906366765499115, | |
| "rewards/format_reward": 1.0, | |
| "step": 635 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.66276550292969, | |
| "epoch": 7.310344827586207, | |
| "grad_norm": 1.4763178317390981, | |
| "kl": 0.04052734375, | |
| "learning_rate": 2.689655172413793e-07, | |
| "loss": 0.0017, | |
| "reward": 1.898642897605896, | |
| "reward_std": 0.017671559005975723, | |
| "rewards/accuracy_reward": 0.8986430168151855, | |
| "rewards/format_reward": 1.0, | |
| "step": 636 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.041015625, | |
| "epoch": 7.32183908045977, | |
| "grad_norm": 1.6822870021161793, | |
| "kl": 0.048583984375, | |
| "learning_rate": 2.67816091954023e-07, | |
| "loss": 0.002, | |
| "reward": 1.8975701332092285, | |
| "reward_std": 0.018257655203342438, | |
| "rewards/accuracy_reward": 0.8982211947441101, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 637 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.96549987792969, | |
| "epoch": 7.333333333333333, | |
| "grad_norm": 1.4931878670654746, | |
| "kl": 0.044189453125, | |
| "learning_rate": 2.6666666666666667e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9090592861175537, | |
| "reward_std": 0.018635626882314682, | |
| "rewards/accuracy_reward": 0.9090592265129089, | |
| "rewards/format_reward": 1.0, | |
| "step": 638 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.53450775146484, | |
| "epoch": 7.344827586206897, | |
| "grad_norm": 1.96698633932992, | |
| "kl": 0.044677734375, | |
| "learning_rate": 2.655172413793103e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8955191373825073, | |
| "reward_std": 0.01896989531815052, | |
| "rewards/accuracy_reward": 0.8961701393127441, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 639 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.40495300292969, | |
| "epoch": 7.35632183908046, | |
| "grad_norm": 2.0570123873234203, | |
| "kl": 0.0419921875, | |
| "learning_rate": 2.64367816091954e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9056072235107422, | |
| "reward_std": 0.01762087456882, | |
| "rewards/accuracy_reward": 0.9062582850456238, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 640 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.19857025146484, | |
| "epoch": 7.3678160919540225, | |
| "grad_norm": 1.6420815311017691, | |
| "kl": 0.043212890625, | |
| "learning_rate": 2.632183908045977e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8763535022735596, | |
| "reward_std": 0.02036045491695404, | |
| "rewards/accuracy_reward": 0.8763534426689148, | |
| "rewards/format_reward": 1.0, | |
| "step": 641 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.95247650146484, | |
| "epoch": 7.379310344827586, | |
| "grad_norm": 1.68174331233095, | |
| "kl": 0.04638671875, | |
| "learning_rate": 2.620689655172414e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9021470546722412, | |
| "reward_std": 0.02229691669344902, | |
| "rewards/accuracy_reward": 0.9021470546722412, | |
| "rewards/format_reward": 1.0, | |
| "step": 642 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.66732025146484, | |
| "epoch": 7.390804597701149, | |
| "grad_norm": 1.6367021800993964, | |
| "kl": 0.04296875, | |
| "learning_rate": 2.6091954022988507e-07, | |
| "loss": 0.0018, | |
| "reward": 1.904337763786316, | |
| "reward_std": 0.021419089287519455, | |
| "rewards/accuracy_reward": 0.9056398868560791, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 643 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.353515625, | |
| "epoch": 7.402298850574713, | |
| "grad_norm": 1.7586162669562002, | |
| "kl": 0.047607421875, | |
| "learning_rate": 2.597701149425287e-07, | |
| "loss": 0.002, | |
| "reward": 1.8953742980957031, | |
| "reward_std": 0.01957518607378006, | |
| "rewards/accuracy_reward": 0.8960254192352295, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 644 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.19010925292969, | |
| "epoch": 7.413793103448276, | |
| "grad_norm": 2.5073706132424447, | |
| "kl": 0.044921875, | |
| "learning_rate": 2.586206896551724e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8890656232833862, | |
| "reward_std": 0.02112973853945732, | |
| "rewards/accuracy_reward": 0.8903676867485046, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 645 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.71484375, | |
| "epoch": 7.425287356321839, | |
| "grad_norm": 2.802792541775325, | |
| "kl": 0.045166015625, | |
| "learning_rate": 2.574712643678161e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8906625509262085, | |
| "reward_std": 0.021000966429710388, | |
| "rewards/accuracy_reward": 0.8906625509262085, | |
| "rewards/format_reward": 1.0, | |
| "step": 646 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.3203125, | |
| "epoch": 7.436781609195402, | |
| "grad_norm": 1.531808532330182, | |
| "kl": 0.04296875, | |
| "learning_rate": 2.563218390804597e-07, | |
| "loss": 0.0018, | |
| "reward": 1.902920126914978, | |
| "reward_std": 0.018811281770467758, | |
| "rewards/accuracy_reward": 0.902920126914978, | |
| "rewards/format_reward": 1.0, | |
| "step": 647 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.66796875, | |
| "epoch": 7.448275862068965, | |
| "grad_norm": 2.9559025913941355, | |
| "kl": 0.0439453125, | |
| "learning_rate": 2.5517241379310346e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8963336944580078, | |
| "reward_std": 0.020042497664690018, | |
| "rewards/accuracy_reward": 0.8969849348068237, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 648 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.14909362792969, | |
| "epoch": 7.459770114942529, | |
| "grad_norm": 1.9565147315824727, | |
| "kl": 0.04736328125, | |
| "learning_rate": 2.540229885057471e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9087027311325073, | |
| "reward_std": 0.018100788816809654, | |
| "rewards/accuracy_reward": 0.9087027311325073, | |
| "rewards/format_reward": 1.0, | |
| "step": 649 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.951171875, | |
| "epoch": 7.471264367816092, | |
| "grad_norm": 3.147083124745903, | |
| "kl": 0.04443359375, | |
| "learning_rate": 2.5287356321839084e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9156570434570312, | |
| "reward_std": 0.01904350332915783, | |
| "rewards/accuracy_reward": 0.9156570434570312, | |
| "rewards/format_reward": 1.0, | |
| "step": 650 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.552734375, | |
| "epoch": 7.482758620689655, | |
| "grad_norm": 1.6343407556749157, | |
| "kl": 0.051513671875, | |
| "learning_rate": 2.517241379310345e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9167011976242065, | |
| "reward_std": 0.01607966236770153, | |
| "rewards/accuracy_reward": 0.9167011976242065, | |
| "rewards/format_reward": 1.0, | |
| "step": 651 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.46940612792969, | |
| "epoch": 7.494252873563219, | |
| "grad_norm": 2.380370478966516, | |
| "kl": 0.04541015625, | |
| "learning_rate": 2.505747126436781e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8996713161468506, | |
| "reward_std": 0.018466008827090263, | |
| "rewards/accuracy_reward": 0.9003223776817322, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 652 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.76692962646484, | |
| "epoch": 7.505747126436781, | |
| "grad_norm": 1.572354097669339, | |
| "kl": 0.049560546875, | |
| "learning_rate": 2.494252873563218e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8812230825424194, | |
| "reward_std": 0.020288746803998947, | |
| "rewards/accuracy_reward": 0.8818740844726562, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 653 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.83984375, | |
| "epoch": 7.517241379310345, | |
| "grad_norm": 3.4037020500640507, | |
| "kl": 0.057373046875, | |
| "learning_rate": 2.482758620689655e-07, | |
| "loss": 0.0024, | |
| "reward": 1.9109630584716797, | |
| "reward_std": 0.01749694161117077, | |
| "rewards/accuracy_reward": 0.9109630584716797, | |
| "rewards/format_reward": 1.0, | |
| "step": 654 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.07292175292969, | |
| "epoch": 7.528735632183908, | |
| "grad_norm": 1.4740870320089872, | |
| "kl": 0.049072265625, | |
| "learning_rate": 2.471264367816092e-07, | |
| "loss": 0.002, | |
| "reward": 1.9072259664535522, | |
| "reward_std": 0.016256902366876602, | |
| "rewards/accuracy_reward": 0.907226026058197, | |
| "rewards/format_reward": 1.0, | |
| "step": 655 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.85612487792969, | |
| "epoch": 7.540229885057471, | |
| "grad_norm": 2.126019785623936, | |
| "kl": 0.04931640625, | |
| "learning_rate": 2.459770114942529e-07, | |
| "loss": 0.002, | |
| "reward": 1.8846591711044312, | |
| "reward_std": 0.018113628029823303, | |
| "rewards/accuracy_reward": 0.8846592903137207, | |
| "rewards/format_reward": 1.0, | |
| "step": 656 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.55794525146484, | |
| "epoch": 7.551724137931035, | |
| "grad_norm": 7.89243748687455, | |
| "kl": 0.047607421875, | |
| "learning_rate": 2.448275862068965e-07, | |
| "loss": 0.002, | |
| "reward": 1.8965566158294678, | |
| "reward_std": 0.019315050914883614, | |
| "rewards/accuracy_reward": 0.896556556224823, | |
| "rewards/format_reward": 1.0, | |
| "step": 657 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.54817962646484, | |
| "epoch": 7.563218390804598, | |
| "grad_norm": 3.4267861157909403, | |
| "kl": 0.046142578125, | |
| "learning_rate": 2.436781609195402e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8947169780731201, | |
| "reward_std": 0.019318781793117523, | |
| "rewards/accuracy_reward": 0.8947169184684753, | |
| "rewards/format_reward": 1.0, | |
| "step": 658 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.568359375, | |
| "epoch": 7.574712643678161, | |
| "grad_norm": 1.6852921029556645, | |
| "kl": 0.052490234375, | |
| "learning_rate": 2.425287356321839e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9210675954818726, | |
| "reward_std": 0.015680838376283646, | |
| "rewards/accuracy_reward": 0.9210675954818726, | |
| "rewards/format_reward": 1.0, | |
| "step": 659 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.3046875, | |
| "epoch": 7.586206896551724, | |
| "grad_norm": 2.206767755219908, | |
| "kl": 0.04541015625, | |
| "learning_rate": 2.413793103448276e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8979721069335938, | |
| "reward_std": 0.016466915607452393, | |
| "rewards/accuracy_reward": 0.8979721069335938, | |
| "rewards/format_reward": 1.0, | |
| "step": 660 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.42643737792969, | |
| "epoch": 7.597701149425287, | |
| "grad_norm": 2.116528759364446, | |
| "kl": 0.043701171875, | |
| "learning_rate": 2.402298850574712e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9062350988388062, | |
| "reward_std": 0.015808025375008583, | |
| "rewards/accuracy_reward": 0.9062352180480957, | |
| "rewards/format_reward": 1.0, | |
| "step": 661 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.13021087646484, | |
| "epoch": 7.609195402298851, | |
| "grad_norm": 2.3658556212140773, | |
| "kl": 0.04736328125, | |
| "learning_rate": 2.390804597701149e-07, | |
| "loss": 0.002, | |
| "reward": 1.9090293645858765, | |
| "reward_std": 0.020903117954730988, | |
| "rewards/accuracy_reward": 0.9096805453300476, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 662 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.53060150146484, | |
| "epoch": 7.620689655172414, | |
| "grad_norm": 1.736164910774429, | |
| "kl": 0.04443359375, | |
| "learning_rate": 2.3793103448275863e-07, | |
| "loss": 0.0018, | |
| "reward": 1.892618179321289, | |
| "reward_std": 0.020376306027173996, | |
| "rewards/accuracy_reward": 0.8932693004608154, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 663 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.14192962646484, | |
| "epoch": 7.6321839080459775, | |
| "grad_norm": 1.661775706838517, | |
| "kl": 0.04345703125, | |
| "learning_rate": 2.367816091954023e-07, | |
| "loss": 0.0018, | |
| "reward": 1.910132884979248, | |
| "reward_std": 0.01860654354095459, | |
| "rewards/accuracy_reward": 0.9107840061187744, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 664 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.82878112792969, | |
| "epoch": 7.64367816091954, | |
| "grad_norm": 1.775794567645862, | |
| "kl": 0.047607421875, | |
| "learning_rate": 2.3563218390804595e-07, | |
| "loss": 0.002, | |
| "reward": 1.8953571319580078, | |
| "reward_std": 0.019160928204655647, | |
| "rewards/accuracy_reward": 0.8953571319580078, | |
| "rewards/format_reward": 1.0, | |
| "step": 665 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.8515625, | |
| "epoch": 7.655172413793103, | |
| "grad_norm": 2.852042326779303, | |
| "kl": 0.05224609375, | |
| "learning_rate": 2.3448275862068964e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9160139560699463, | |
| "reward_std": 0.01864560693502426, | |
| "rewards/accuracy_reward": 0.9160139560699463, | |
| "rewards/format_reward": 1.0, | |
| "step": 666 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.22135925292969, | |
| "epoch": 7.666666666666667, | |
| "grad_norm": 2.00298655553839, | |
| "kl": 0.046142578125, | |
| "learning_rate": 2.3333333333333333e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8910598754882812, | |
| "reward_std": 0.020590651780366898, | |
| "rewards/accuracy_reward": 0.8910599946975708, | |
| "rewards/format_reward": 1.0, | |
| "step": 667 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.58268737792969, | |
| "epoch": 7.67816091954023, | |
| "grad_norm": 2.2015919771538255, | |
| "kl": 0.044189453125, | |
| "learning_rate": 2.3218390804597702e-07, | |
| "loss": 0.0018, | |
| "reward": 1.897012710571289, | |
| "reward_std": 0.02280309796333313, | |
| "rewards/accuracy_reward": 0.8983148336410522, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 668 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.41471862792969, | |
| "epoch": 7.689655172413794, | |
| "grad_norm": 4.620691107539993, | |
| "kl": 0.048828125, | |
| "learning_rate": 2.3103448275862066e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9026827812194824, | |
| "reward_std": 0.018936630338430405, | |
| "rewards/accuracy_reward": 0.903333842754364, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 669 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.208984375, | |
| "epoch": 7.7011494252873565, | |
| "grad_norm": 1.5207850843287138, | |
| "kl": 0.052001953125, | |
| "learning_rate": 2.2988505747126435e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8975833654403687, | |
| "reward_std": 0.01803259551525116, | |
| "rewards/accuracy_reward": 0.8975834846496582, | |
| "rewards/format_reward": 1.0, | |
| "step": 670 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.48698425292969, | |
| "epoch": 7.712643678160919, | |
| "grad_norm": 1.1439576464320695, | |
| "kl": 0.04248046875, | |
| "learning_rate": 2.2873563218390804e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8865493535995483, | |
| "reward_std": 0.018641415983438492, | |
| "rewards/accuracy_reward": 0.8865493535995483, | |
| "rewards/format_reward": 1.0, | |
| "step": 671 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.46745300292969, | |
| "epoch": 7.724137931034483, | |
| "grad_norm": 1.5940069065520925, | |
| "kl": 0.0419921875, | |
| "learning_rate": 2.2758620689655173e-07, | |
| "loss": 0.0017, | |
| "reward": 1.874269723892212, | |
| "reward_std": 0.02033749222755432, | |
| "rewards/accuracy_reward": 0.8742696642875671, | |
| "rewards/format_reward": 1.0, | |
| "step": 672 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.087890625, | |
| "epoch": 7.735632183908046, | |
| "grad_norm": 2.1488145925757234, | |
| "kl": 0.048828125, | |
| "learning_rate": 2.264367816091954e-07, | |
| "loss": 0.002, | |
| "reward": 1.9152870178222656, | |
| "reward_std": 0.015497724525630474, | |
| "rewards/accuracy_reward": 0.9152869582176208, | |
| "rewards/format_reward": 1.0, | |
| "step": 673 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.20052337646484, | |
| "epoch": 7.747126436781609, | |
| "grad_norm": 1.5334204725557474, | |
| "kl": 0.045654296875, | |
| "learning_rate": 2.2528735632183905e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9050960540771484, | |
| "reward_std": 0.021023746579885483, | |
| "rewards/accuracy_reward": 0.9057471752166748, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 674 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.88671875, | |
| "epoch": 7.758620689655173, | |
| "grad_norm": 1.6306945359970868, | |
| "kl": 0.03955078125, | |
| "learning_rate": 2.2413793103448274e-07, | |
| "loss": 0.0017, | |
| "reward": 1.908623456954956, | |
| "reward_std": 0.017657993361353874, | |
| "rewards/accuracy_reward": 0.9086233973503113, | |
| "rewards/format_reward": 1.0, | |
| "step": 675 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.23567962646484, | |
| "epoch": 7.7701149425287355, | |
| "grad_norm": 1.3620347321088389, | |
| "kl": 0.0400390625, | |
| "learning_rate": 2.2298850574712643e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8560750484466553, | |
| "reward_std": 0.021627038717269897, | |
| "rewards/accuracy_reward": 0.8573770523071289, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 676 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.77278900146484, | |
| "epoch": 7.781609195402299, | |
| "grad_norm": 2.4445187153033547, | |
| "kl": 0.041015625, | |
| "learning_rate": 2.218390804597701e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9019120931625366, | |
| "reward_std": 0.02054600417613983, | |
| "rewards/accuracy_reward": 0.9019122123718262, | |
| "rewards/format_reward": 1.0, | |
| "step": 677 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.171875, | |
| "epoch": 7.793103448275862, | |
| "grad_norm": 2.2620273599837537, | |
| "kl": 0.03759765625, | |
| "learning_rate": 2.206896551724138e-07, | |
| "loss": 0.0016, | |
| "reward": 1.899298071861267, | |
| "reward_std": 0.019102804362773895, | |
| "rewards/accuracy_reward": 0.8992981314659119, | |
| "rewards/format_reward": 1.0, | |
| "step": 678 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.884765625, | |
| "epoch": 7.804597701149425, | |
| "grad_norm": 2.7848035680621446, | |
| "kl": 0.038330078125, | |
| "learning_rate": 2.1954022988505748e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8842277526855469, | |
| "reward_std": 0.015971308574080467, | |
| "rewards/accuracy_reward": 0.8842277526855469, | |
| "rewards/format_reward": 1.0, | |
| "step": 679 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.04232025146484, | |
| "epoch": 7.816091954022989, | |
| "grad_norm": 1.526409335432903, | |
| "kl": 0.04736328125, | |
| "learning_rate": 2.1839080459770114e-07, | |
| "loss": 0.002, | |
| "reward": 1.9061853885650635, | |
| "reward_std": 0.018970629200339317, | |
| "rewards/accuracy_reward": 0.9061853289604187, | |
| "rewards/format_reward": 1.0, | |
| "step": 680 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.986328125, | |
| "epoch": 7.827586206896552, | |
| "grad_norm": 2.4205054636705627, | |
| "kl": 0.04345703125, | |
| "learning_rate": 2.172413793103448e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8783591985702515, | |
| "reward_std": 0.020205635577440262, | |
| "rewards/accuracy_reward": 0.8783591985702515, | |
| "rewards/format_reward": 1.0, | |
| "step": 681 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.24544525146484, | |
| "epoch": 7.8390804597701145, | |
| "grad_norm": 1.6438250497849267, | |
| "kl": 0.0380859375, | |
| "learning_rate": 2.160919540229885e-07, | |
| "loss": 0.0016, | |
| "reward": 1.9045305252075195, | |
| "reward_std": 0.020377201959490776, | |
| "rewards/accuracy_reward": 0.9051817655563354, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 682 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.76171875, | |
| "epoch": 7.850574712643678, | |
| "grad_norm": 1.2996080329358168, | |
| "kl": 0.041748046875, | |
| "learning_rate": 2.1494252873563218e-07, | |
| "loss": 0.0017, | |
| "reward": 1.907975435256958, | |
| "reward_std": 0.017765961587429047, | |
| "rewards/accuracy_reward": 0.9079753756523132, | |
| "rewards/format_reward": 1.0, | |
| "step": 683 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.03190612792969, | |
| "epoch": 7.862068965517241, | |
| "grad_norm": 5.191492192434162, | |
| "kl": 0.0400390625, | |
| "learning_rate": 2.1379310344827587e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9033807516098022, | |
| "reward_std": 0.019389839842915535, | |
| "rewards/accuracy_reward": 0.9033806324005127, | |
| "rewards/format_reward": 1.0, | |
| "step": 684 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.60026550292969, | |
| "epoch": 7.873563218390805, | |
| "grad_norm": 2.4150479297071725, | |
| "kl": 0.04248046875, | |
| "learning_rate": 2.126436781609195e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9296969175338745, | |
| "reward_std": 0.01699325256049633, | |
| "rewards/accuracy_reward": 0.9296969175338745, | |
| "rewards/format_reward": 1.0, | |
| "step": 685 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.80078125, | |
| "epoch": 7.885057471264368, | |
| "grad_norm": 1.3996427566274907, | |
| "kl": 0.041015625, | |
| "learning_rate": 2.114942528735632e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9054107666015625, | |
| "reward_std": 0.01808765158057213, | |
| "rewards/accuracy_reward": 0.905410885810852, | |
| "rewards/format_reward": 1.0, | |
| "step": 686 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.173828125, | |
| "epoch": 7.896551724137931, | |
| "grad_norm": 1.4978085636014402, | |
| "kl": 0.041259765625, | |
| "learning_rate": 2.103448275862069e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9016507863998413, | |
| "reward_std": 0.018792547285556793, | |
| "rewards/accuracy_reward": 0.9016507863998413, | |
| "rewards/format_reward": 1.0, | |
| "step": 687 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.52995300292969, | |
| "epoch": 7.908045977011494, | |
| "grad_norm": 3.073796889740199, | |
| "kl": 0.03857421875, | |
| "learning_rate": 2.0919540229885058e-07, | |
| "loss": 0.0016, | |
| "reward": 1.913766860961914, | |
| "reward_std": 0.01984010636806488, | |
| "rewards/accuracy_reward": 0.9144179224967957, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 688 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.84049987792969, | |
| "epoch": 7.919540229885057, | |
| "grad_norm": 1.7046597310271008, | |
| "kl": 0.039306640625, | |
| "learning_rate": 2.0804597701149424e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8879507780075073, | |
| "reward_std": 0.02212076261639595, | |
| "rewards/accuracy_reward": 0.8892530202865601, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 689 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.87435150146484, | |
| "epoch": 7.931034482758621, | |
| "grad_norm": 2.469859985195517, | |
| "kl": 0.044921875, | |
| "learning_rate": 2.0689655172413793e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9116500616073608, | |
| "reward_std": 0.021436279639601707, | |
| "rewards/accuracy_reward": 0.9129521250724792, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 690 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.86979675292969, | |
| "epoch": 7.942528735632184, | |
| "grad_norm": 1.7410487864129292, | |
| "kl": 0.043212890625, | |
| "learning_rate": 2.057471264367816e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9140040874481201, | |
| "reward_std": 0.018954966217279434, | |
| "rewards/accuracy_reward": 0.9140040278434753, | |
| "rewards/format_reward": 1.0, | |
| "step": 691 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.541015625, | |
| "epoch": 7.954022988505747, | |
| "grad_norm": 1.7493728678791296, | |
| "kl": 0.04443359375, | |
| "learning_rate": 2.0459770114942528e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9026793241500854, | |
| "reward_std": 0.015167943201959133, | |
| "rewards/accuracy_reward": 0.9026793241500854, | |
| "rewards/format_reward": 1.0, | |
| "step": 692 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.72982025146484, | |
| "epoch": 7.9655172413793105, | |
| "grad_norm": 1.7859158502563095, | |
| "kl": 0.043701171875, | |
| "learning_rate": 2.0344827586206895e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8844761848449707, | |
| "reward_std": 0.021765243262052536, | |
| "rewards/accuracy_reward": 0.8844761848449707, | |
| "rewards/format_reward": 1.0, | |
| "step": 693 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.84440612792969, | |
| "epoch": 7.977011494252873, | |
| "grad_norm": 1.5585940385657546, | |
| "kl": 0.04541015625, | |
| "learning_rate": 2.0229885057471264e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8896058797836304, | |
| "reward_std": 0.022624578326940536, | |
| "rewards/accuracy_reward": 0.8902568817138672, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 694 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.83268737792969, | |
| "epoch": 7.988505747126437, | |
| "grad_norm": 4.611642156415871, | |
| "kl": 0.045654296875, | |
| "learning_rate": 2.0114942528735633e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9184387922286987, | |
| "reward_std": 0.01827239617705345, | |
| "rewards/accuracy_reward": 0.9184388518333435, | |
| "rewards/format_reward": 1.0, | |
| "step": 695 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 100.7078628540039, | |
| "epoch": 8.0, | |
| "grad_norm": 3.175089899536824, | |
| "kl": 0.043212890625, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0019, | |
| "reward": 1.917198657989502, | |
| "reward_std": 0.017983654513955116, | |
| "rewards/accuracy_reward": 0.9171985983848572, | |
| "rewards/format_reward": 1.0, | |
| "step": 696 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.88671875, | |
| "epoch": 8.011494252873563, | |
| "grad_norm": 1.599375509212271, | |
| "kl": 0.04736328125, | |
| "learning_rate": 1.9885057471264365e-07, | |
| "loss": 0.002, | |
| "reward": 1.900770902633667, | |
| "reward_std": 0.020885910838842392, | |
| "rewards/accuracy_reward": 0.9007708430290222, | |
| "rewards/format_reward": 1.0, | |
| "step": 697 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.009765625, | |
| "epoch": 8.022988505747126, | |
| "grad_norm": 1.5228925864244678, | |
| "kl": 0.042236328125, | |
| "learning_rate": 1.9770114942528734e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8776483535766602, | |
| "reward_std": 0.020006125792860985, | |
| "rewards/accuracy_reward": 0.878299355506897, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 698 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.82942962646484, | |
| "epoch": 8.03448275862069, | |
| "grad_norm": 1.8238558955068145, | |
| "kl": 0.044189453125, | |
| "learning_rate": 1.9655172413793103e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9061753749847412, | |
| "reward_std": 0.01833120919764042, | |
| "rewards/accuracy_reward": 0.9061753153800964, | |
| "rewards/format_reward": 1.0, | |
| "step": 699 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.52864837646484, | |
| "epoch": 8.045977011494253, | |
| "grad_norm": 1.3341303550118901, | |
| "kl": 0.048095703125, | |
| "learning_rate": 1.9540229885057472e-07, | |
| "loss": 0.002, | |
| "reward": 1.8628864288330078, | |
| "reward_std": 0.02142808958888054, | |
| "rewards/accuracy_reward": 0.8628865480422974, | |
| "rewards/format_reward": 1.0, | |
| "step": 700 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.52799987792969, | |
| "epoch": 8.057471264367816, | |
| "grad_norm": 1.761222226586845, | |
| "kl": 0.0439453125, | |
| "learning_rate": 1.9425287356321836e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8934277296066284, | |
| "reward_std": 0.0226028673350811, | |
| "rewards/accuracy_reward": 0.8940789103507996, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 701 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.29948425292969, | |
| "epoch": 8.068965517241379, | |
| "grad_norm": 1.5102446080782972, | |
| "kl": 0.0439453125, | |
| "learning_rate": 1.9310344827586205e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8807815313339233, | |
| "reward_std": 0.023064211010932922, | |
| "rewards/accuracy_reward": 0.8814325332641602, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 702 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.68099212646484, | |
| "epoch": 8.080459770114942, | |
| "grad_norm": 2.6463573417228234, | |
| "kl": 0.0458984375, | |
| "learning_rate": 1.9195402298850574e-07, | |
| "loss": 0.0019, | |
| "reward": 1.91706120967865, | |
| "reward_std": 0.018780669197440147, | |
| "rewards/accuracy_reward": 0.9170613288879395, | |
| "rewards/format_reward": 1.0, | |
| "step": 703 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.86653900146484, | |
| "epoch": 8.091954022988507, | |
| "grad_norm": 1.9782556380033625, | |
| "kl": 0.046875, | |
| "learning_rate": 1.9080459770114943e-07, | |
| "loss": 0.002, | |
| "reward": 1.9006422758102417, | |
| "reward_std": 0.02227187156677246, | |
| "rewards/accuracy_reward": 0.9012932777404785, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 704 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.27799987792969, | |
| "epoch": 8.10344827586207, | |
| "grad_norm": 1.571198957982024, | |
| "kl": 0.043701171875, | |
| "learning_rate": 1.896551724137931e-07, | |
| "loss": 0.0018, | |
| "reward": 1.901580810546875, | |
| "reward_std": 0.018761413171887398, | |
| "rewards/accuracy_reward": 0.901580810546875, | |
| "rewards/format_reward": 1.0, | |
| "step": 705 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.69140625, | |
| "epoch": 8.114942528735632, | |
| "grad_norm": 2.093159658605368, | |
| "kl": 0.044921875, | |
| "learning_rate": 1.8850574712643678e-07, | |
| "loss": 0.0019, | |
| "reward": 1.891250491142273, | |
| "reward_std": 0.019944649189710617, | |
| "rewards/accuracy_reward": 0.8912505507469177, | |
| "rewards/format_reward": 1.0, | |
| "step": 706 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.19921875, | |
| "epoch": 8.126436781609195, | |
| "grad_norm": 2.529542496168213, | |
| "kl": 0.041748046875, | |
| "learning_rate": 1.8735632183908045e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8948949575424194, | |
| "reward_std": 0.020839963108301163, | |
| "rewards/accuracy_reward": 0.8948950171470642, | |
| "rewards/format_reward": 1.0, | |
| "step": 707 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.47721862792969, | |
| "epoch": 8.137931034482758, | |
| "grad_norm": 2.451670338869928, | |
| "kl": 0.05419921875, | |
| "learning_rate": 1.8620689655172414e-07, | |
| "loss": 0.0023, | |
| "reward": 1.8975166082382202, | |
| "reward_std": 0.02222670242190361, | |
| "rewards/accuracy_reward": 0.898167610168457, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 708 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.72005462646484, | |
| "epoch": 8.149425287356323, | |
| "grad_norm": 2.6109529506496525, | |
| "kl": 0.04150390625, | |
| "learning_rate": 1.850574712643678e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8816875219345093, | |
| "reward_std": 0.021295679733157158, | |
| "rewards/accuracy_reward": 0.881687343120575, | |
| "rewards/format_reward": 1.0, | |
| "step": 709 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.10482025146484, | |
| "epoch": 8.160919540229886, | |
| "grad_norm": 2.236575205027573, | |
| "kl": 0.04638671875, | |
| "learning_rate": 1.839080459770115e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8859424591064453, | |
| "reward_std": 0.01964872144162655, | |
| "rewards/accuracy_reward": 0.8859424591064453, | |
| "rewards/format_reward": 1.0, | |
| "step": 710 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.97721862792969, | |
| "epoch": 8.172413793103448, | |
| "grad_norm": 2.981879850123249, | |
| "kl": 0.044677734375, | |
| "learning_rate": 1.8275862068965518e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9109057188034058, | |
| "reward_std": 0.01663116365671158, | |
| "rewards/accuracy_reward": 0.9109057188034058, | |
| "rewards/format_reward": 1.0, | |
| "step": 711 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.57487487792969, | |
| "epoch": 8.183908045977011, | |
| "grad_norm": 2.3188147233415277, | |
| "kl": 0.05029296875, | |
| "learning_rate": 1.8160919540229884e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8978303670883179, | |
| "reward_std": 0.021748527884483337, | |
| "rewards/accuracy_reward": 0.8978304266929626, | |
| "rewards/format_reward": 1.0, | |
| "step": 712 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.91927337646484, | |
| "epoch": 8.195402298850574, | |
| "grad_norm": 3.225354097129705, | |
| "kl": 0.0439453125, | |
| "learning_rate": 1.804597701149425e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8894078731536865, | |
| "reward_std": 0.018167873844504356, | |
| "rewards/accuracy_reward": 0.8894079923629761, | |
| "rewards/format_reward": 1.0, | |
| "step": 713 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.33203125, | |
| "epoch": 8.206896551724139, | |
| "grad_norm": 2.1412928496025216, | |
| "kl": 0.0458984375, | |
| "learning_rate": 1.793103448275862e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9061758518218994, | |
| "reward_std": 0.019810751080513, | |
| "rewards/accuracy_reward": 0.9061757922172546, | |
| "rewards/format_reward": 1.0, | |
| "step": 714 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.83919525146484, | |
| "epoch": 8.218390804597702, | |
| "grad_norm": 1.4510305932233432, | |
| "kl": 0.049072265625, | |
| "learning_rate": 1.7816091954022988e-07, | |
| "loss": 0.002, | |
| "reward": 1.8892874717712402, | |
| "reward_std": 0.01888991892337799, | |
| "rewards/accuracy_reward": 0.8892874717712402, | |
| "rewards/format_reward": 1.0, | |
| "step": 715 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.51302337646484, | |
| "epoch": 8.229885057471265, | |
| "grad_norm": 2.2029044678618663, | |
| "kl": 0.0537109375, | |
| "learning_rate": 1.7701149425287357e-07, | |
| "loss": 0.0022, | |
| "reward": 1.885032057762146, | |
| "reward_std": 0.01888071931898594, | |
| "rewards/accuracy_reward": 0.8850321173667908, | |
| "rewards/format_reward": 1.0, | |
| "step": 716 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.80534362792969, | |
| "epoch": 8.241379310344827, | |
| "grad_norm": 2.3323680467220145, | |
| "kl": 0.04833984375, | |
| "learning_rate": 1.758620689655172e-07, | |
| "loss": 0.002, | |
| "reward": 1.9214333295822144, | |
| "reward_std": 0.021036818623542786, | |
| "rewards/accuracy_reward": 0.9220844507217407, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 717 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.37435150146484, | |
| "epoch": 8.25287356321839, | |
| "grad_norm": 2.8730245319899317, | |
| "kl": 0.051513671875, | |
| "learning_rate": 1.747126436781609e-07, | |
| "loss": 0.0022, | |
| "reward": 1.8911713361740112, | |
| "reward_std": 0.019480139017105103, | |
| "rewards/accuracy_reward": 0.892473578453064, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 718 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.451171875, | |
| "epoch": 8.264367816091955, | |
| "grad_norm": 1.9234276428720276, | |
| "kl": 0.045654296875, | |
| "learning_rate": 1.735632183908046e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9077482223510742, | |
| "reward_std": 0.01729690097272396, | |
| "rewards/accuracy_reward": 0.9077481627464294, | |
| "rewards/format_reward": 1.0, | |
| "step": 719 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.9140625, | |
| "epoch": 8.275862068965518, | |
| "grad_norm": 1.7140585268322626, | |
| "kl": 0.05126953125, | |
| "learning_rate": 1.7241379310344828e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9137029647827148, | |
| "reward_std": 0.018116027116775513, | |
| "rewards/accuracy_reward": 0.9137029647827148, | |
| "rewards/format_reward": 1.0, | |
| "step": 720 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.93034362792969, | |
| "epoch": 8.28735632183908, | |
| "grad_norm": 2.219230959644551, | |
| "kl": 0.04736328125, | |
| "learning_rate": 1.7126436781609194e-07, | |
| "loss": 0.0019, | |
| "reward": 1.894079566001892, | |
| "reward_std": 0.02045278623700142, | |
| "rewards/accuracy_reward": 0.8940793871879578, | |
| "rewards/format_reward": 1.0, | |
| "step": 721 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.455078125, | |
| "epoch": 8.298850574712644, | |
| "grad_norm": 2.5331188317191087, | |
| "kl": 0.04541015625, | |
| "learning_rate": 1.7011494252873563e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9165410995483398, | |
| "reward_std": 0.017832912504673004, | |
| "rewards/accuracy_reward": 0.9165410995483398, | |
| "rewards/format_reward": 1.0, | |
| "step": 722 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.32357025146484, | |
| "epoch": 8.310344827586206, | |
| "grad_norm": 4.02373195859424, | |
| "kl": 0.0478515625, | |
| "learning_rate": 1.689655172413793e-07, | |
| "loss": 0.002, | |
| "reward": 1.894669532775879, | |
| "reward_std": 0.020446766167879105, | |
| "rewards/accuracy_reward": 0.8946696519851685, | |
| "rewards/format_reward": 1.0, | |
| "step": 723 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.5, | |
| "epoch": 8.32183908045977, | |
| "grad_norm": 1.8288623944358686, | |
| "kl": 0.051513671875, | |
| "learning_rate": 1.67816091954023e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9076260328292847, | |
| "reward_std": 0.019866902381181717, | |
| "rewards/accuracy_reward": 0.9082770347595215, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 724 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.72786712646484, | |
| "epoch": 8.333333333333334, | |
| "grad_norm": 5.4298852737123635, | |
| "kl": 0.048095703125, | |
| "learning_rate": 1.6666666666666665e-07, | |
| "loss": 0.002, | |
| "reward": 1.9031809568405151, | |
| "reward_std": 0.02082451805472374, | |
| "rewards/accuracy_reward": 0.903831958770752, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 725 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.61458587646484, | |
| "epoch": 8.344827586206897, | |
| "grad_norm": 2.2093808314572296, | |
| "kl": 0.049072265625, | |
| "learning_rate": 1.6551724137931034e-07, | |
| "loss": 0.002, | |
| "reward": 1.8967069387435913, | |
| "reward_std": 0.018487486988306046, | |
| "rewards/accuracy_reward": 0.8967069983482361, | |
| "rewards/format_reward": 1.0, | |
| "step": 726 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.33268737792969, | |
| "epoch": 8.35632183908046, | |
| "grad_norm": 1.6410241094216826, | |
| "kl": 0.0478515625, | |
| "learning_rate": 1.6436781609195403e-07, | |
| "loss": 0.002, | |
| "reward": 1.9131641387939453, | |
| "reward_std": 0.016289152204990387, | |
| "rewards/accuracy_reward": 0.9131642580032349, | |
| "rewards/format_reward": 1.0, | |
| "step": 727 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.259765625, | |
| "epoch": 8.367816091954023, | |
| "grad_norm": 1.7686301784709073, | |
| "kl": 0.049560546875, | |
| "learning_rate": 1.632183908045977e-07, | |
| "loss": 0.002, | |
| "reward": 1.9046128988265991, | |
| "reward_std": 0.0201382078230381, | |
| "rewards/accuracy_reward": 0.9046128988265991, | |
| "rewards/format_reward": 1.0, | |
| "step": 728 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.443359375, | |
| "epoch": 8.379310344827585, | |
| "grad_norm": 2.16179386604187, | |
| "kl": 0.0458984375, | |
| "learning_rate": 1.6206896551724136e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8986444473266602, | |
| "reward_std": 0.01659722626209259, | |
| "rewards/accuracy_reward": 0.8986444473266602, | |
| "rewards/format_reward": 1.0, | |
| "step": 729 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.22721862792969, | |
| "epoch": 8.39080459770115, | |
| "grad_norm": 2.8856095934176857, | |
| "kl": 0.048828125, | |
| "learning_rate": 1.6091954022988505e-07, | |
| "loss": 0.002, | |
| "reward": 1.9096076488494873, | |
| "reward_std": 0.018815144896507263, | |
| "rewards/accuracy_reward": 0.9096077680587769, | |
| "rewards/format_reward": 1.0, | |
| "step": 730 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.619140625, | |
| "epoch": 8.402298850574713, | |
| "grad_norm": 2.9006809475096658, | |
| "kl": 0.051513671875, | |
| "learning_rate": 1.5977011494252874e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9206702709197998, | |
| "reward_std": 0.01633971370756626, | |
| "rewards/accuracy_reward": 0.920670211315155, | |
| "rewards/format_reward": 1.0, | |
| "step": 731 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.259765625, | |
| "epoch": 8.413793103448276, | |
| "grad_norm": 1.9751932337603062, | |
| "kl": 0.052734375, | |
| "learning_rate": 1.5862068965517243e-07, | |
| "loss": 0.0022, | |
| "reward": 1.893907070159912, | |
| "reward_std": 0.02224368415772915, | |
| "rewards/accuracy_reward": 0.8939072489738464, | |
| "rewards/format_reward": 1.0, | |
| "step": 732 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.00846862792969, | |
| "epoch": 8.425287356321839, | |
| "grad_norm": 2.0957540123306364, | |
| "kl": 0.0556640625, | |
| "learning_rate": 1.5747126436781606e-07, | |
| "loss": 0.0023, | |
| "reward": 1.9105441570281982, | |
| "reward_std": 0.01823435351252556, | |
| "rewards/accuracy_reward": 0.9105440974235535, | |
| "rewards/format_reward": 1.0, | |
| "step": 733 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.10872650146484, | |
| "epoch": 8.436781609195402, | |
| "grad_norm": 1.7574530073426011, | |
| "kl": 0.049072265625, | |
| "learning_rate": 1.5632183908045975e-07, | |
| "loss": 0.002, | |
| "reward": 1.9063684940338135, | |
| "reward_std": 0.017554182559251785, | |
| "rewards/accuracy_reward": 0.9063684940338135, | |
| "rewards/format_reward": 1.0, | |
| "step": 734 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.40755462646484, | |
| "epoch": 8.448275862068966, | |
| "grad_norm": 1.718104369150053, | |
| "kl": 0.04638671875, | |
| "learning_rate": 1.5517241379310344e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9011447429656982, | |
| "reward_std": 0.02049892395734787, | |
| "rewards/accuracy_reward": 0.9017957448959351, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 735 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.095703125, | |
| "epoch": 8.459770114942529, | |
| "grad_norm": 1.8096241235073485, | |
| "kl": 0.044921875, | |
| "learning_rate": 1.5402298850574713e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9035829305648804, | |
| "reward_std": 0.017755288630723953, | |
| "rewards/accuracy_reward": 0.9035829305648804, | |
| "rewards/format_reward": 1.0, | |
| "step": 736 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.228515625, | |
| "epoch": 8.471264367816092, | |
| "grad_norm": 4.3156887737751966, | |
| "kl": 0.04541015625, | |
| "learning_rate": 1.528735632183908e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8951524496078491, | |
| "reward_std": 0.01714376173913479, | |
| "rewards/accuracy_reward": 0.8951523303985596, | |
| "rewards/format_reward": 1.0, | |
| "step": 737 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.240234375, | |
| "epoch": 8.482758620689655, | |
| "grad_norm": 1.8203610430487198, | |
| "kl": 0.046142578125, | |
| "learning_rate": 1.5172413793103449e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9012572765350342, | |
| "reward_std": 0.020000826567411423, | |
| "rewards/accuracy_reward": 0.9019083976745605, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 738 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.80078125, | |
| "epoch": 8.494252873563218, | |
| "grad_norm": 2.4493356638370547, | |
| "kl": 0.046630859375, | |
| "learning_rate": 1.5057471264367815e-07, | |
| "loss": 0.002, | |
| "reward": 1.8994574546813965, | |
| "reward_std": 0.01900039240717888, | |
| "rewards/accuracy_reward": 0.9001085758209229, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 739 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.68815612792969, | |
| "epoch": 8.505747126436782, | |
| "grad_norm": 1.7961165376254185, | |
| "kl": 0.048583984375, | |
| "learning_rate": 1.4942528735632184e-07, | |
| "loss": 0.002, | |
| "reward": 1.8904352188110352, | |
| "reward_std": 0.018353838473558426, | |
| "rewards/accuracy_reward": 0.8904353380203247, | |
| "rewards/format_reward": 1.0, | |
| "step": 740 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.22721862792969, | |
| "epoch": 8.517241379310345, | |
| "grad_norm": 2.124448648242551, | |
| "kl": 0.048583984375, | |
| "learning_rate": 1.482758620689655e-07, | |
| "loss": 0.002, | |
| "reward": 1.9015287160873413, | |
| "reward_std": 0.017897652462124825, | |
| "rewards/accuracy_reward": 0.9015287756919861, | |
| "rewards/format_reward": 1.0, | |
| "step": 741 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.91081237792969, | |
| "epoch": 8.528735632183908, | |
| "grad_norm": 1.5182426183899522, | |
| "kl": 0.048095703125, | |
| "learning_rate": 1.471264367816092e-07, | |
| "loss": 0.002, | |
| "reward": 1.9014829397201538, | |
| "reward_std": 0.020442795008420944, | |
| "rewards/accuracy_reward": 0.9021339416503906, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 742 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.91536712646484, | |
| "epoch": 8.540229885057471, | |
| "grad_norm": 1.5309622912667127, | |
| "kl": 0.05029296875, | |
| "learning_rate": 1.4597701149425288e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9063705205917358, | |
| "reward_std": 0.019261373206973076, | |
| "rewards/accuracy_reward": 0.9070214629173279, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 743 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.28190612792969, | |
| "epoch": 8.551724137931034, | |
| "grad_norm": 1.4853282790643823, | |
| "kl": 0.04296875, | |
| "learning_rate": 1.4482758620689654e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9033745527267456, | |
| "reward_std": 0.01895163580775261, | |
| "rewards/accuracy_reward": 0.904025673866272, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 744 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.767578125, | |
| "epoch": 8.563218390804598, | |
| "grad_norm": 1.5019420177279086, | |
| "kl": 0.053466796875, | |
| "learning_rate": 1.436781609195402e-07, | |
| "loss": 0.0022, | |
| "reward": 1.8998239040374756, | |
| "reward_std": 0.021759074181318283, | |
| "rewards/accuracy_reward": 0.9004749059677124, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 745 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.77018737792969, | |
| "epoch": 8.574712643678161, | |
| "grad_norm": 1.5414148798759049, | |
| "kl": 0.04736328125, | |
| "learning_rate": 1.425287356321839e-07, | |
| "loss": 0.002, | |
| "reward": 1.9003026485443115, | |
| "reward_std": 0.021097760647535324, | |
| "rewards/accuracy_reward": 0.9009537100791931, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 746 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.10872650146484, | |
| "epoch": 8.586206896551724, | |
| "grad_norm": 4.960221853934078, | |
| "kl": 0.04345703125, | |
| "learning_rate": 1.413793103448276e-07, | |
| "loss": 0.0018, | |
| "reward": 1.920248031616211, | |
| "reward_std": 0.01949543133378029, | |
| "rewards/accuracy_reward": 0.9208990931510925, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 747 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.27409362792969, | |
| "epoch": 8.597701149425287, | |
| "grad_norm": 1.464489207745014, | |
| "kl": 0.044921875, | |
| "learning_rate": 1.4022988505747128e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9014743566513062, | |
| "reward_std": 0.017846036702394485, | |
| "rewards/accuracy_reward": 0.9014744162559509, | |
| "rewards/format_reward": 1.0, | |
| "step": 748 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.01302337646484, | |
| "epoch": 8.60919540229885, | |
| "grad_norm": 5.576587184664697, | |
| "kl": 0.04443359375, | |
| "learning_rate": 1.3908045977011494e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8989577293395996, | |
| "reward_std": 0.019103601574897766, | |
| "rewards/accuracy_reward": 0.8989577293395996, | |
| "rewards/format_reward": 1.0, | |
| "step": 749 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.55208587646484, | |
| "epoch": 8.620689655172415, | |
| "grad_norm": 3.391688889257715, | |
| "kl": 0.042236328125, | |
| "learning_rate": 1.379310344827586e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9022835493087769, | |
| "reward_std": 0.020112819969654083, | |
| "rewards/accuracy_reward": 0.9022834300994873, | |
| "rewards/format_reward": 1.0, | |
| "step": 750 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.23112487792969, | |
| "epoch": 8.632183908045977, | |
| "grad_norm": 1.415131831015453, | |
| "kl": 0.046630859375, | |
| "learning_rate": 1.367816091954023e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8941152095794678, | |
| "reward_std": 0.02364903688430786, | |
| "rewards/accuracy_reward": 0.894115149974823, | |
| "rewards/format_reward": 1.0, | |
| "step": 751 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.39974212646484, | |
| "epoch": 8.64367816091954, | |
| "grad_norm": 3.686963959842639, | |
| "kl": 0.043212890625, | |
| "learning_rate": 1.3563218390804598e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8956639766693115, | |
| "reward_std": 0.01937917247414589, | |
| "rewards/accuracy_reward": 0.8956639170646667, | |
| "rewards/format_reward": 1.0, | |
| "step": 752 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.33659362792969, | |
| "epoch": 8.655172413793103, | |
| "grad_norm": 11.793387997885002, | |
| "kl": 0.04443359375, | |
| "learning_rate": 1.3448275862068965e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9092130661010742, | |
| "reward_std": 0.020166244357824326, | |
| "rewards/accuracy_reward": 0.9098643064498901, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 753 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.69075775146484, | |
| "epoch": 8.666666666666666, | |
| "grad_norm": 1.419014822654193, | |
| "kl": 0.0517578125, | |
| "learning_rate": 1.3333333333333334e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8976190090179443, | |
| "reward_std": 0.018913621082901955, | |
| "rewards/accuracy_reward": 0.8976188898086548, | |
| "rewards/format_reward": 1.0, | |
| "step": 754 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.52799987792969, | |
| "epoch": 8.678160919540229, | |
| "grad_norm": 5.95529479625069, | |
| "kl": 0.043212890625, | |
| "learning_rate": 1.32183908045977e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9074852466583252, | |
| "reward_std": 0.017688315361738205, | |
| "rewards/accuracy_reward": 0.9074852466583252, | |
| "rewards/format_reward": 1.0, | |
| "step": 755 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.25521087646484, | |
| "epoch": 8.689655172413794, | |
| "grad_norm": 2.7625404647260265, | |
| "kl": 0.048828125, | |
| "learning_rate": 1.310344827586207e-07, | |
| "loss": 0.002, | |
| "reward": 1.8992561101913452, | |
| "reward_std": 0.021415000781416893, | |
| "rewards/accuracy_reward": 0.899907112121582, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 756 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.72721862792969, | |
| "epoch": 8.701149425287356, | |
| "grad_norm": 3.132870047881132, | |
| "kl": 0.04638671875, | |
| "learning_rate": 1.2988505747126435e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8953626155853271, | |
| "reward_std": 0.01965472660958767, | |
| "rewards/accuracy_reward": 0.8953627347946167, | |
| "rewards/format_reward": 1.0, | |
| "step": 757 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.06185150146484, | |
| "epoch": 8.71264367816092, | |
| "grad_norm": 1.4067186469745852, | |
| "kl": 0.047119140625, | |
| "learning_rate": 1.2873563218390804e-07, | |
| "loss": 0.002, | |
| "reward": 1.9052462577819824, | |
| "reward_std": 0.019855869933962822, | |
| "rewards/accuracy_reward": 0.9052464365959167, | |
| "rewards/format_reward": 1.0, | |
| "step": 758 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.248046875, | |
| "epoch": 8.724137931034482, | |
| "grad_norm": 1.6124971541846125, | |
| "kl": 0.0478515625, | |
| "learning_rate": 1.2758620689655173e-07, | |
| "loss": 0.002, | |
| "reward": 1.9074506759643555, | |
| "reward_std": 0.017859049141407013, | |
| "rewards/accuracy_reward": 0.9074506759643555, | |
| "rewards/format_reward": 1.0, | |
| "step": 759 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.123046875, | |
| "epoch": 8.735632183908045, | |
| "grad_norm": 1.6651646299458338, | |
| "kl": 0.041259765625, | |
| "learning_rate": 1.2643678160919542e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8970210552215576, | |
| "reward_std": 0.01788368821144104, | |
| "rewards/accuracy_reward": 0.8970209956169128, | |
| "rewards/format_reward": 1.0, | |
| "step": 760 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.68099212646484, | |
| "epoch": 8.74712643678161, | |
| "grad_norm": 1.908276402264626, | |
| "kl": 0.0478515625, | |
| "learning_rate": 1.2528735632183906e-07, | |
| "loss": 0.002, | |
| "reward": 1.911945104598999, | |
| "reward_std": 0.020722268149256706, | |
| "rewards/accuracy_reward": 0.9125961065292358, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 761 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.97396087646484, | |
| "epoch": 8.758620689655173, | |
| "grad_norm": 1.909218993992126, | |
| "kl": 0.04345703125, | |
| "learning_rate": 1.2413793103448275e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9106658697128296, | |
| "reward_std": 0.017978399991989136, | |
| "rewards/accuracy_reward": 0.9106658697128296, | |
| "rewards/format_reward": 1.0, | |
| "step": 762 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.734375, | |
| "epoch": 8.770114942528735, | |
| "grad_norm": 2.2467623211703014, | |
| "kl": 0.045654296875, | |
| "learning_rate": 1.2298850574712644e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9027222394943237, | |
| "reward_std": 0.016572915017604828, | |
| "rewards/accuracy_reward": 0.9027222394943237, | |
| "rewards/format_reward": 1.0, | |
| "step": 763 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.62630462646484, | |
| "epoch": 8.781609195402298, | |
| "grad_norm": 5.599771626606617, | |
| "kl": 0.04833984375, | |
| "learning_rate": 1.218390804597701e-07, | |
| "loss": 0.002, | |
| "reward": 1.9221599102020264, | |
| "reward_std": 0.016314268112182617, | |
| "rewards/accuracy_reward": 0.9221599102020264, | |
| "rewards/format_reward": 1.0, | |
| "step": 764 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.12760925292969, | |
| "epoch": 8.793103448275861, | |
| "grad_norm": 2.4498609839357823, | |
| "kl": 0.05078125, | |
| "learning_rate": 1.206896551724138e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9224801063537598, | |
| "reward_std": 0.01809362694621086, | |
| "rewards/accuracy_reward": 0.9224801063537598, | |
| "rewards/format_reward": 1.0, | |
| "step": 765 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.19857025146484, | |
| "epoch": 8.804597701149426, | |
| "grad_norm": 2.5801196829452584, | |
| "kl": 0.0439453125, | |
| "learning_rate": 1.1954022988505745e-07, | |
| "loss": 0.0018, | |
| "reward": 1.898393154144287, | |
| "reward_std": 0.01924772374331951, | |
| "rewards/accuracy_reward": 0.8983932733535767, | |
| "rewards/format_reward": 1.0, | |
| "step": 766 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.40690612792969, | |
| "epoch": 8.816091954022989, | |
| "grad_norm": 2.032967972913373, | |
| "kl": 0.04345703125, | |
| "learning_rate": 1.1839080459770114e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9099705219268799, | |
| "reward_std": 0.02100074663758278, | |
| "rewards/accuracy_reward": 0.9106216430664062, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 767 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.61653900146484, | |
| "epoch": 8.827586206896552, | |
| "grad_norm": 2.07582370315509, | |
| "kl": 0.046630859375, | |
| "learning_rate": 1.1724137931034482e-07, | |
| "loss": 0.002, | |
| "reward": 1.9123599529266357, | |
| "reward_std": 0.018274664878845215, | |
| "rewards/accuracy_reward": 0.912359893321991, | |
| "rewards/format_reward": 1.0, | |
| "step": 768 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.04817962646484, | |
| "epoch": 8.839080459770114, | |
| "grad_norm": 2.1468683147084873, | |
| "kl": 0.04638671875, | |
| "learning_rate": 1.1609195402298851e-07, | |
| "loss": 0.0019, | |
| "reward": 1.905731439590454, | |
| "reward_std": 0.021076953038573265, | |
| "rewards/accuracy_reward": 0.9057313799858093, | |
| "rewards/format_reward": 1.0, | |
| "step": 769 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.3046875, | |
| "epoch": 8.850574712643677, | |
| "grad_norm": 1.5666594265928468, | |
| "kl": 0.044677734375, | |
| "learning_rate": 1.1494252873563217e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9021600484848022, | |
| "reward_std": 0.021117765456438065, | |
| "rewards/accuracy_reward": 0.902160108089447, | |
| "rewards/format_reward": 1.0, | |
| "step": 770 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.53255462646484, | |
| "epoch": 8.862068965517242, | |
| "grad_norm": 3.9234683438946254, | |
| "kl": 0.044677734375, | |
| "learning_rate": 1.1379310344827586e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9176257848739624, | |
| "reward_std": 0.018359411507844925, | |
| "rewards/accuracy_reward": 0.9176258444786072, | |
| "rewards/format_reward": 1.0, | |
| "step": 771 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.14974212646484, | |
| "epoch": 8.873563218390805, | |
| "grad_norm": 3.472231746273925, | |
| "kl": 0.044189453125, | |
| "learning_rate": 1.1264367816091953e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9001665115356445, | |
| "reward_std": 0.018229342997074127, | |
| "rewards/accuracy_reward": 0.9001666903495789, | |
| "rewards/format_reward": 1.0, | |
| "step": 772 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.37825775146484, | |
| "epoch": 8.885057471264368, | |
| "grad_norm": 3.1028518982061635, | |
| "kl": 0.046875, | |
| "learning_rate": 1.1149425287356322e-07, | |
| "loss": 0.0019, | |
| "reward": 1.911559820175171, | |
| "reward_std": 0.018318263813853264, | |
| "rewards/accuracy_reward": 0.9115597605705261, | |
| "rewards/format_reward": 1.0, | |
| "step": 773 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.07487487792969, | |
| "epoch": 8.89655172413793, | |
| "grad_norm": 2.1968963906989276, | |
| "kl": 0.047607421875, | |
| "learning_rate": 1.103448275862069e-07, | |
| "loss": 0.002, | |
| "reward": 1.9014253616333008, | |
| "reward_std": 0.02046075090765953, | |
| "rewards/accuracy_reward": 0.9020764827728271, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 774 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.416015625, | |
| "epoch": 8.908045977011493, | |
| "grad_norm": 1.899936745200757, | |
| "kl": 0.046630859375, | |
| "learning_rate": 1.0919540229885057e-07, | |
| "loss": 0.002, | |
| "reward": 1.8940951824188232, | |
| "reward_std": 0.017228955402970314, | |
| "rewards/accuracy_reward": 0.8940951228141785, | |
| "rewards/format_reward": 1.0, | |
| "step": 775 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.01692962646484, | |
| "epoch": 8.919540229885058, | |
| "grad_norm": 2.350383361616655, | |
| "kl": 0.046630859375, | |
| "learning_rate": 1.0804597701149425e-07, | |
| "loss": 0.002, | |
| "reward": 1.8946069478988647, | |
| "reward_std": 0.01980327069759369, | |
| "rewards/accuracy_reward": 0.8946070075035095, | |
| "rewards/format_reward": 1.0, | |
| "step": 776 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.88411712646484, | |
| "epoch": 8.931034482758621, | |
| "grad_norm": 5.641269463386258, | |
| "kl": 0.04296875, | |
| "learning_rate": 1.0689655172413794e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8930943012237549, | |
| "reward_std": 0.020226947963237762, | |
| "rewards/accuracy_reward": 0.8930944204330444, | |
| "rewards/format_reward": 1.0, | |
| "step": 777 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.46224212646484, | |
| "epoch": 8.942528735632184, | |
| "grad_norm": 1.590527535893143, | |
| "kl": 0.04541015625, | |
| "learning_rate": 1.057471264367816e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9104032516479492, | |
| "reward_std": 0.01798829436302185, | |
| "rewards/accuracy_reward": 0.9104033708572388, | |
| "rewards/format_reward": 1.0, | |
| "step": 778 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.85612487792969, | |
| "epoch": 8.954022988505747, | |
| "grad_norm": 2.6107268057573365, | |
| "kl": 0.048095703125, | |
| "learning_rate": 1.0459770114942529e-07, | |
| "loss": 0.002, | |
| "reward": 1.9010858535766602, | |
| "reward_std": 0.020739298313856125, | |
| "rewards/accuracy_reward": 0.9010860323905945, | |
| "rewards/format_reward": 1.0, | |
| "step": 779 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.03385925292969, | |
| "epoch": 8.96551724137931, | |
| "grad_norm": 1.6092109607995282, | |
| "kl": 0.053466796875, | |
| "learning_rate": 1.0344827586206897e-07, | |
| "loss": 0.0023, | |
| "reward": 1.900530219078064, | |
| "reward_std": 0.01735313981771469, | |
| "rewards/accuracy_reward": 0.900530219078064, | |
| "rewards/format_reward": 1.0, | |
| "step": 780 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.65950775146484, | |
| "epoch": 8.977011494252874, | |
| "grad_norm": 1.4883968181068261, | |
| "kl": 0.049560546875, | |
| "learning_rate": 1.0229885057471264e-07, | |
| "loss": 0.002, | |
| "reward": 1.9065711498260498, | |
| "reward_std": 0.016935113817453384, | |
| "rewards/accuracy_reward": 0.9065712690353394, | |
| "rewards/format_reward": 1.0, | |
| "step": 781 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.84635925292969, | |
| "epoch": 8.988505747126437, | |
| "grad_norm": 1.6246833714697817, | |
| "kl": 0.04931640625, | |
| "learning_rate": 1.0114942528735632e-07, | |
| "loss": 0.002, | |
| "reward": 1.91986083984375, | |
| "reward_std": 0.015720397233963013, | |
| "rewards/accuracy_reward": 0.91986083984375, | |
| "rewards/format_reward": 1.0, | |
| "step": 782 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 99.25140380859375, | |
| "epoch": 9.0, | |
| "grad_norm": 2.248799852714355, | |
| "kl": 0.041259765625, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0017, | |
| "reward": 1.899660587310791, | |
| "reward_std": 0.019224824383854866, | |
| "rewards/accuracy_reward": 0.8996607661247253, | |
| "rewards/format_reward": 1.0, | |
| "step": 783 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.44075775146484, | |
| "epoch": 9.011494252873563, | |
| "grad_norm": 1.8721483816760687, | |
| "kl": 0.051513671875, | |
| "learning_rate": 9.885057471264367e-08, | |
| "loss": 0.0021, | |
| "reward": 1.8844293355941772, | |
| "reward_std": 0.025172477588057518, | |
| "rewards/accuracy_reward": 0.8857313990592957, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 784 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.6640625, | |
| "epoch": 9.022988505747126, | |
| "grad_norm": 1.8506016770380465, | |
| "kl": 0.04150390625, | |
| "learning_rate": 9.770114942528736e-08, | |
| "loss": 0.0017, | |
| "reward": 1.911178469657898, | |
| "reward_std": 0.021104078739881516, | |
| "rewards/accuracy_reward": 0.911178469657898, | |
| "rewards/format_reward": 1.0, | |
| "step": 785 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.47526550292969, | |
| "epoch": 9.03448275862069, | |
| "grad_norm": 2.4400240165615634, | |
| "kl": 0.04248046875, | |
| "learning_rate": 9.655172413793103e-08, | |
| "loss": 0.0018, | |
| "reward": 1.9191011190414429, | |
| "reward_std": 0.016874711960554123, | |
| "rewards/accuracy_reward": 0.9191012382507324, | |
| "rewards/format_reward": 1.0, | |
| "step": 786 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.58268737792969, | |
| "epoch": 9.045977011494253, | |
| "grad_norm": 3.3949879642064476, | |
| "kl": 0.04150390625, | |
| "learning_rate": 9.540229885057471e-08, | |
| "loss": 0.0017, | |
| "reward": 1.8661808967590332, | |
| "reward_std": 0.02081323228776455, | |
| "rewards/accuracy_reward": 0.8661808967590332, | |
| "rewards/format_reward": 1.0, | |
| "step": 787 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.15234375, | |
| "epoch": 9.057471264367816, | |
| "grad_norm": 1.407910897768254, | |
| "kl": 0.051025390625, | |
| "learning_rate": 9.425287356321839e-08, | |
| "loss": 0.0021, | |
| "reward": 1.8680272102355957, | |
| "reward_std": 0.02056352235376835, | |
| "rewards/accuracy_reward": 0.8680272102355957, | |
| "rewards/format_reward": 1.0, | |
| "step": 788 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.87956237792969, | |
| "epoch": 9.068965517241379, | |
| "grad_norm": 2.6980150332453334, | |
| "kl": 0.04443359375, | |
| "learning_rate": 9.310344827586207e-08, | |
| "loss": 0.0018, | |
| "reward": 1.885411262512207, | |
| "reward_std": 0.021717345342040062, | |
| "rewards/accuracy_reward": 0.885411262512207, | |
| "rewards/format_reward": 1.0, | |
| "step": 789 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.94010925292969, | |
| "epoch": 9.080459770114942, | |
| "grad_norm": 1.6808051261990247, | |
| "kl": 0.05615234375, | |
| "learning_rate": 9.195402298850574e-08, | |
| "loss": 0.0023, | |
| "reward": 1.901023268699646, | |
| "reward_std": 0.021362271159887314, | |
| "rewards/accuracy_reward": 0.9010233879089355, | |
| "rewards/format_reward": 1.0, | |
| "step": 790 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.22721862792969, | |
| "epoch": 9.091954022988507, | |
| "grad_norm": 1.9651267648591069, | |
| "kl": 0.048828125, | |
| "learning_rate": 9.080459770114942e-08, | |
| "loss": 0.002, | |
| "reward": 1.8997548818588257, | |
| "reward_std": 0.018945304676890373, | |
| "rewards/accuracy_reward": 0.8997548818588257, | |
| "rewards/format_reward": 1.0, | |
| "step": 791 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.98046875, | |
| "epoch": 9.10344827586207, | |
| "grad_norm": 1.5615310862319665, | |
| "kl": 0.0458984375, | |
| "learning_rate": 8.96551724137931e-08, | |
| "loss": 0.0019, | |
| "reward": 1.9096624851226807, | |
| "reward_std": 0.018772246316075325, | |
| "rewards/accuracy_reward": 0.9096624851226807, | |
| "rewards/format_reward": 1.0, | |
| "step": 792 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.50456237792969, | |
| "epoch": 9.114942528735632, | |
| "grad_norm": 1.8984567348891395, | |
| "kl": 0.053466796875, | |
| "learning_rate": 8.850574712643679e-08, | |
| "loss": 0.0022, | |
| "reward": 1.8971011638641357, | |
| "reward_std": 0.02134164236485958, | |
| "rewards/accuracy_reward": 0.8971012830734253, | |
| "rewards/format_reward": 1.0, | |
| "step": 793 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.59765625, | |
| "epoch": 9.126436781609195, | |
| "grad_norm": 1.57123496870828, | |
| "kl": 0.05126953125, | |
| "learning_rate": 8.735632183908045e-08, | |
| "loss": 0.0021, | |
| "reward": 1.8993648290634155, | |
| "reward_std": 0.02058519423007965, | |
| "rewards/accuracy_reward": 0.8993649482727051, | |
| "rewards/format_reward": 1.0, | |
| "step": 794 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.01692962646484, | |
| "epoch": 9.137931034482758, | |
| "grad_norm": 2.1980038174190373, | |
| "kl": 0.038818359375, | |
| "learning_rate": 8.620689655172414e-08, | |
| "loss": 0.0016, | |
| "reward": 1.8727396726608276, | |
| "reward_std": 0.021485071629285812, | |
| "rewards/accuracy_reward": 0.8727396726608276, | |
| "rewards/format_reward": 1.0, | |
| "step": 795 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.57682800292969, | |
| "epoch": 9.149425287356323, | |
| "grad_norm": 1.5298307460022529, | |
| "kl": 0.044921875, | |
| "learning_rate": 8.505747126436782e-08, | |
| "loss": 0.0019, | |
| "reward": 1.9137659072875977, | |
| "reward_std": 0.01766360178589821, | |
| "rewards/accuracy_reward": 0.9137659072875977, | |
| "rewards/format_reward": 1.0, | |
| "step": 796 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.77474212646484, | |
| "epoch": 9.160919540229886, | |
| "grad_norm": 1.801682202262651, | |
| "kl": 0.04296875, | |
| "learning_rate": 8.39080459770115e-08, | |
| "loss": 0.0018, | |
| "reward": 1.8823485374450684, | |
| "reward_std": 0.023980390280485153, | |
| "rewards/accuracy_reward": 0.8829997777938843, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 797 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.94596862792969, | |
| "epoch": 9.172413793103448, | |
| "grad_norm": 2.2238602851803284, | |
| "kl": 0.041015625, | |
| "learning_rate": 8.275862068965517e-08, | |
| "loss": 0.0017, | |
| "reward": 1.8901784420013428, | |
| "reward_std": 0.021750375628471375, | |
| "rewards/accuracy_reward": 0.8908295631408691, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 798 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.58464050292969, | |
| "epoch": 9.183908045977011, | |
| "grad_norm": 3.448825022079825, | |
| "kl": 0.042724609375, | |
| "learning_rate": 8.160919540229885e-08, | |
| "loss": 0.0018, | |
| "reward": 1.902247428894043, | |
| "reward_std": 0.020935572683811188, | |
| "rewards/accuracy_reward": 0.9028984904289246, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 799 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.63216400146484, | |
| "epoch": 9.195402298850574, | |
| "grad_norm": 1.7683408378496788, | |
| "kl": 0.041259765625, | |
| "learning_rate": 8.045977011494252e-08, | |
| "loss": 0.0017, | |
| "reward": 1.8915985822677612, | |
| "reward_std": 0.02043468877673149, | |
| "rewards/accuracy_reward": 0.891598641872406, | |
| "rewards/format_reward": 1.0, | |
| "step": 800 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.3046875, | |
| "epoch": 9.206896551724139, | |
| "grad_norm": 3.4479135032415082, | |
| "kl": 0.0419921875, | |
| "learning_rate": 7.931034482758621e-08, | |
| "loss": 0.0018, | |
| "reward": 1.8980789184570312, | |
| "reward_std": 0.020334240049123764, | |
| "rewards/accuracy_reward": 0.8980789184570312, | |
| "rewards/format_reward": 1.0, | |
| "step": 801 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.453125, | |
| "epoch": 9.218390804597702, | |
| "grad_norm": 2.8315233473682286, | |
| "kl": 0.0439453125, | |
| "learning_rate": 7.816091954022988e-08, | |
| "loss": 0.0018, | |
| "reward": 1.9106957912445068, | |
| "reward_std": 0.01965712383389473, | |
| "rewards/accuracy_reward": 0.9106957912445068, | |
| "rewards/format_reward": 1.0, | |
| "step": 802 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.0078125, | |
| "epoch": 9.229885057471265, | |
| "grad_norm": 8.809973822334465, | |
| "kl": 0.04443359375, | |
| "learning_rate": 7.701149425287357e-08, | |
| "loss": 0.0018, | |
| "reward": 1.8973687887191772, | |
| "reward_std": 0.02227511629462242, | |
| "rewards/accuracy_reward": 0.8973686695098877, | |
| "rewards/format_reward": 1.0, | |
| "step": 803 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.857421875, | |
| "epoch": 9.241379310344827, | |
| "grad_norm": 2.290511339554376, | |
| "kl": 0.048095703125, | |
| "learning_rate": 7.586206896551724e-08, | |
| "loss": 0.002, | |
| "reward": 1.926220417022705, | |
| "reward_std": 0.020036086440086365, | |
| "rewards/accuracy_reward": 0.9268714785575867, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 804 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.24153900146484, | |
| "epoch": 9.25287356321839, | |
| "grad_norm": 10.259694200109895, | |
| "kl": 0.04541015625, | |
| "learning_rate": 7.471264367816092e-08, | |
| "loss": 0.0019, | |
| "reward": 1.9228681325912476, | |
| "reward_std": 0.02166038751602173, | |
| "rewards/accuracy_reward": 0.9228681325912476, | |
| "rewards/format_reward": 1.0, | |
| "step": 805 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.47786712646484, | |
| "epoch": 9.264367816091955, | |
| "grad_norm": 1.888552839892748, | |
| "kl": 0.040771484375, | |
| "learning_rate": 7.35632183908046e-08, | |
| "loss": 0.0017, | |
| "reward": 1.905164122581482, | |
| "reward_std": 0.017032243311405182, | |
| "rewards/accuracy_reward": 0.9051642417907715, | |
| "rewards/format_reward": 1.0, | |
| "step": 806 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.90560150146484, | |
| "epoch": 9.275862068965518, | |
| "grad_norm": 1.5511138521125212, | |
| "kl": 0.052001953125, | |
| "learning_rate": 7.241379310344827e-08, | |
| "loss": 0.0021, | |
| "reward": 1.912177562713623, | |
| "reward_std": 0.018603047356009483, | |
| "rewards/accuracy_reward": 0.912177562713623, | |
| "rewards/format_reward": 1.0, | |
| "step": 807 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.09114837646484, | |
| "epoch": 9.28735632183908, | |
| "grad_norm": 2.2223963738104775, | |
| "kl": 0.04443359375, | |
| "learning_rate": 7.126436781609195e-08, | |
| "loss": 0.0018, | |
| "reward": 1.9001119136810303, | |
| "reward_std": 0.023462196812033653, | |
| "rewards/accuracy_reward": 0.9001118540763855, | |
| "rewards/format_reward": 1.0, | |
| "step": 808 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.89388275146484, | |
| "epoch": 9.298850574712644, | |
| "grad_norm": 6.250219527341955, | |
| "kl": 0.046142578125, | |
| "learning_rate": 7.011494252873564e-08, | |
| "loss": 0.0019, | |
| "reward": 1.9060745239257812, | |
| "reward_std": 0.021167151629924774, | |
| "rewards/accuracy_reward": 0.9067255258560181, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 809 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.06120300292969, | |
| "epoch": 9.310344827586206, | |
| "grad_norm": 3.0663601968471044, | |
| "kl": 0.04248046875, | |
| "learning_rate": 6.89655172413793e-08, | |
| "loss": 0.0018, | |
| "reward": 1.904802680015564, | |
| "reward_std": 0.02016662247478962, | |
| "rewards/accuracy_reward": 0.9054538011550903, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 810 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.279296875, | |
| "epoch": 9.32183908045977, | |
| "grad_norm": 2.1266705913854707, | |
| "kl": 0.06591796875, | |
| "learning_rate": 6.781609195402299e-08, | |
| "loss": 0.0027, | |
| "reward": 1.8952583074569702, | |
| "reward_std": 0.023519501090049744, | |
| "rewards/accuracy_reward": 0.895909309387207, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 811 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.70573425292969, | |
| "epoch": 9.333333333333334, | |
| "grad_norm": 1.2920255105728329, | |
| "kl": 0.041259765625, | |
| "learning_rate": 6.666666666666667e-08, | |
| "loss": 0.0017, | |
| "reward": 1.9121431112289429, | |
| "reward_std": 0.01818576082587242, | |
| "rewards/accuracy_reward": 0.9121431112289429, | |
| "rewards/format_reward": 1.0, | |
| "step": 812 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.04948425292969, | |
| "epoch": 9.344827586206897, | |
| "grad_norm": 1.9698975522109214, | |
| "kl": 0.044189453125, | |
| "learning_rate": 6.551724137931034e-08, | |
| "loss": 0.0018, | |
| "reward": 1.9061200618743896, | |
| "reward_std": 0.019229721277952194, | |
| "rewards/accuracy_reward": 0.9067711234092712, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 813 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.99479675292969, | |
| "epoch": 9.35632183908046, | |
| "grad_norm": 1.5466544483838331, | |
| "kl": 0.044677734375, | |
| "learning_rate": 6.436781609195402e-08, | |
| "loss": 0.0019, | |
| "reward": 1.8965463638305664, | |
| "reward_std": 0.026135286316275597, | |
| "rewards/accuracy_reward": 0.8978484869003296, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 814 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.06771087646484, | |
| "epoch": 9.367816091954023, | |
| "grad_norm": 2.9008920163167327, | |
| "kl": 0.04345703125, | |
| "learning_rate": 6.321839080459771e-08, | |
| "loss": 0.0018, | |
| "reward": 1.8903131484985352, | |
| "reward_std": 0.020935822278261185, | |
| "rewards/accuracy_reward": 0.8903131484985352, | |
| "rewards/format_reward": 1.0, | |
| "step": 815 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.97396087646484, | |
| "epoch": 9.379310344827585, | |
| "grad_norm": 2.2134917020578797, | |
| "kl": 0.043212890625, | |
| "learning_rate": 6.206896551724137e-08, | |
| "loss": 0.0018, | |
| "reward": 1.894513487815857, | |
| "reward_std": 0.019658654928207397, | |
| "rewards/accuracy_reward": 0.8945134878158569, | |
| "rewards/format_reward": 1.0, | |
| "step": 816 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.47135925292969, | |
| "epoch": 9.39080459770115, | |
| "grad_norm": 2.0451132122580216, | |
| "kl": 0.044189453125, | |
| "learning_rate": 6.091954022988505e-08, | |
| "loss": 0.0019, | |
| "reward": 1.9205522537231445, | |
| "reward_std": 0.022365611046552658, | |
| "rewards/accuracy_reward": 0.9205522537231445, | |
| "rewards/format_reward": 1.0, | |
| "step": 817 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.357421875, | |
| "epoch": 9.402298850574713, | |
| "grad_norm": 1.5413854537310678, | |
| "kl": 0.04345703125, | |
| "learning_rate": 5.977011494252873e-08, | |
| "loss": 0.0018, | |
| "reward": 1.906002402305603, | |
| "reward_std": 0.020225465297698975, | |
| "rewards/accuracy_reward": 0.9060024619102478, | |
| "rewards/format_reward": 1.0, | |
| "step": 818 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.248046875, | |
| "epoch": 9.413793103448276, | |
| "grad_norm": 1.5942001444003233, | |
| "kl": 0.040771484375, | |
| "learning_rate": 5.862068965517241e-08, | |
| "loss": 0.0017, | |
| "reward": 1.9052398204803467, | |
| "reward_std": 0.018013250082731247, | |
| "rewards/accuracy_reward": 0.9052395820617676, | |
| "rewards/format_reward": 1.0, | |
| "step": 819 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.83528900146484, | |
| "epoch": 9.425287356321839, | |
| "grad_norm": 2.321727658388936, | |
| "kl": 0.04248046875, | |
| "learning_rate": 5.747126436781609e-08, | |
| "loss": 0.0018, | |
| "reward": 1.9112927913665771, | |
| "reward_std": 0.020977940410375595, | |
| "rewards/accuracy_reward": 0.911943793296814, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 820 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.5625, | |
| "epoch": 9.436781609195402, | |
| "grad_norm": 3.718818284558716, | |
| "kl": 0.0458984375, | |
| "learning_rate": 5.6321839080459764e-08, | |
| "loss": 0.002, | |
| "reward": 1.9173364639282227, | |
| "reward_std": 0.01930234208703041, | |
| "rewards/accuracy_reward": 0.9173363447189331, | |
| "rewards/format_reward": 1.0, | |
| "step": 821 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.20833587646484, | |
| "epoch": 9.448275862068966, | |
| "grad_norm": 1.7447738271778865, | |
| "kl": 0.03857421875, | |
| "learning_rate": 5.517241379310345e-08, | |
| "loss": 0.0016, | |
| "reward": 1.905761480331421, | |
| "reward_std": 0.017823033034801483, | |
| "rewards/accuracy_reward": 0.9057614207267761, | |
| "rewards/format_reward": 1.0, | |
| "step": 822 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.22005462646484, | |
| "epoch": 9.459770114942529, | |
| "grad_norm": 5.556438363038334, | |
| "kl": 0.044921875, | |
| "learning_rate": 5.402298850574712e-08, | |
| "loss": 0.0019, | |
| "reward": 1.912406325340271, | |
| "reward_std": 0.017947331070899963, | |
| "rewards/accuracy_reward": 0.912406325340271, | |
| "rewards/format_reward": 1.0, | |
| "step": 823 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.21354675292969, | |
| "epoch": 9.471264367816092, | |
| "grad_norm": 1.564280665225921, | |
| "kl": 0.04296875, | |
| "learning_rate": 5.28735632183908e-08, | |
| "loss": 0.0018, | |
| "reward": 1.9196040630340576, | |
| "reward_std": 0.018538126721978188, | |
| "rewards/accuracy_reward": 0.9196040034294128, | |
| "rewards/format_reward": 1.0, | |
| "step": 824 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.55729675292969, | |
| "epoch": 9.482758620689655, | |
| "grad_norm": 2.4144959847961758, | |
| "kl": 0.0439453125, | |
| "learning_rate": 5.172413793103448e-08, | |
| "loss": 0.0019, | |
| "reward": 1.9196009635925293, | |
| "reward_std": 0.016086673364043236, | |
| "rewards/accuracy_reward": 0.9196010828018188, | |
| "rewards/format_reward": 1.0, | |
| "step": 825 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.38021087646484, | |
| "epoch": 9.494252873563218, | |
| "grad_norm": 4.197005794788358, | |
| "kl": 0.04345703125, | |
| "learning_rate": 5.057471264367816e-08, | |
| "loss": 0.0019, | |
| "reward": 1.898109793663025, | |
| "reward_std": 0.021654874086380005, | |
| "rewards/accuracy_reward": 0.8987607955932617, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 826 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.59310150146484, | |
| "epoch": 9.505747126436782, | |
| "grad_norm": 1.8296996407650712, | |
| "kl": 0.04345703125, | |
| "learning_rate": 4.9425287356321836e-08, | |
| "loss": 0.0018, | |
| "reward": 1.9078954458236694, | |
| "reward_std": 0.018013805150985718, | |
| "rewards/accuracy_reward": 0.907895565032959, | |
| "rewards/format_reward": 1.0, | |
| "step": 827 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.60677337646484, | |
| "epoch": 9.517241379310345, | |
| "grad_norm": 1.6814305912226317, | |
| "kl": 0.041259765625, | |
| "learning_rate": 4.827586206896551e-08, | |
| "loss": 0.0017, | |
| "reward": 1.8963592052459717, | |
| "reward_std": 0.022151313722133636, | |
| "rewards/accuracy_reward": 0.8963589668273926, | |
| "rewards/format_reward": 1.0, | |
| "step": 828 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.73567962646484, | |
| "epoch": 9.528735632183908, | |
| "grad_norm": 1.7924163077233033, | |
| "kl": 0.04296875, | |
| "learning_rate": 4.7126436781609196e-08, | |
| "loss": 0.0018, | |
| "reward": 1.9191563129425049, | |
| "reward_std": 0.01933225616812706, | |
| "rewards/accuracy_reward": 0.9198073744773865, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 829 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.1171875, | |
| "epoch": 9.540229885057471, | |
| "grad_norm": 2.5340920833447895, | |
| "kl": 0.0439453125, | |
| "learning_rate": 4.597701149425287e-08, | |
| "loss": 0.0018, | |
| "reward": 1.89328932762146, | |
| "reward_std": 0.01949036866426468, | |
| "rewards/accuracy_reward": 0.8939403295516968, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 830 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.53255462646484, | |
| "epoch": 9.551724137931034, | |
| "grad_norm": 2.483443690558432, | |
| "kl": 0.052490234375, | |
| "learning_rate": 4.482758620689655e-08, | |
| "loss": 0.0022, | |
| "reward": 1.8969571590423584, | |
| "reward_std": 0.017558250576257706, | |
| "rewards/accuracy_reward": 0.8969571590423584, | |
| "rewards/format_reward": 1.0, | |
| "step": 831 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.533203125, | |
| "epoch": 9.563218390804598, | |
| "grad_norm": 1.291578178291959, | |
| "kl": 0.045166015625, | |
| "learning_rate": 4.3678160919540225e-08, | |
| "loss": 0.0019, | |
| "reward": 1.8948633670806885, | |
| "reward_std": 0.021366355940699577, | |
| "rewards/accuracy_reward": 0.8948633074760437, | |
| "rewards/format_reward": 1.0, | |
| "step": 832 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.82487487792969, | |
| "epoch": 9.574712643678161, | |
| "grad_norm": 2.7537847562643374, | |
| "kl": 0.0419921875, | |
| "learning_rate": 4.252873563218391e-08, | |
| "loss": 0.0017, | |
| "reward": 1.9019429683685303, | |
| "reward_std": 0.018435677513480186, | |
| "rewards/accuracy_reward": 0.9019430875778198, | |
| "rewards/format_reward": 1.0, | |
| "step": 833 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.77799987792969, | |
| "epoch": 9.586206896551724, | |
| "grad_norm": 2.2181518752294624, | |
| "kl": 0.04443359375, | |
| "learning_rate": 4.1379310344827585e-08, | |
| "loss": 0.0018, | |
| "reward": 1.8903684616088867, | |
| "reward_std": 0.018676333129405975, | |
| "rewards/accuracy_reward": 0.8903685808181763, | |
| "rewards/format_reward": 1.0, | |
| "step": 834 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.46224212646484, | |
| "epoch": 9.597701149425287, | |
| "grad_norm": 1.756566376667342, | |
| "kl": 0.046875, | |
| "learning_rate": 4.022988505747126e-08, | |
| "loss": 0.0019, | |
| "reward": 1.8650926351547241, | |
| "reward_std": 0.02378205582499504, | |
| "rewards/accuracy_reward": 0.8657435178756714, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 835 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.02474212646484, | |
| "epoch": 9.60919540229885, | |
| "grad_norm": 1.99335590706468, | |
| "kl": 0.04345703125, | |
| "learning_rate": 3.908045977011494e-08, | |
| "loss": 0.0018, | |
| "reward": 1.8908905982971191, | |
| "reward_std": 0.01960897445678711, | |
| "rewards/accuracy_reward": 0.8915416598320007, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 836 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.8046875, | |
| "epoch": 9.620689655172415, | |
| "grad_norm": 4.785304870304732, | |
| "kl": 0.04345703125, | |
| "learning_rate": 3.793103448275862e-08, | |
| "loss": 0.0018, | |
| "reward": 1.909947395324707, | |
| "reward_std": 0.0192283745855093, | |
| "rewards/accuracy_reward": 0.9099475741386414, | |
| "rewards/format_reward": 1.0, | |
| "step": 837 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.50456237792969, | |
| "epoch": 9.632183908045977, | |
| "grad_norm": 1.6423863350528298, | |
| "kl": 0.04736328125, | |
| "learning_rate": 3.67816091954023e-08, | |
| "loss": 0.0019, | |
| "reward": 1.9007571935653687, | |
| "reward_std": 0.017737818881869316, | |
| "rewards/accuracy_reward": 0.9007573127746582, | |
| "rewards/format_reward": 1.0, | |
| "step": 838 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.982421875, | |
| "epoch": 9.64367816091954, | |
| "grad_norm": 2.049656890092837, | |
| "kl": 0.043701171875, | |
| "learning_rate": 3.5632183908045974e-08, | |
| "loss": 0.0018, | |
| "reward": 1.9175758361816406, | |
| "reward_std": 0.01626306213438511, | |
| "rewards/accuracy_reward": 0.9175758361816406, | |
| "rewards/format_reward": 1.0, | |
| "step": 839 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.10417175292969, | |
| "epoch": 9.655172413793103, | |
| "grad_norm": 1.8848940408267045, | |
| "kl": 0.045654296875, | |
| "learning_rate": 3.448275862068965e-08, | |
| "loss": 0.0019, | |
| "reward": 1.9053196907043457, | |
| "reward_std": 0.022480791434645653, | |
| "rewards/accuracy_reward": 0.9059707522392273, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 840 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.12890625, | |
| "epoch": 9.666666666666666, | |
| "grad_norm": 3.234936396417619, | |
| "kl": 0.04736328125, | |
| "learning_rate": 3.3333333333333334e-08, | |
| "loss": 0.0019, | |
| "reward": 1.905747413635254, | |
| "reward_std": 0.02039037086069584, | |
| "rewards/accuracy_reward": 0.9057474136352539, | |
| "rewards/format_reward": 1.0, | |
| "step": 841 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.443359375, | |
| "epoch": 9.678160919540229, | |
| "grad_norm": 1.7322466630483822, | |
| "kl": 0.04443359375, | |
| "learning_rate": 3.218390804597701e-08, | |
| "loss": 0.0018, | |
| "reward": 1.906328558921814, | |
| "reward_std": 0.022260412573814392, | |
| "rewards/accuracy_reward": 0.9076308012008667, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 842 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.857421875, | |
| "epoch": 9.689655172413794, | |
| "grad_norm": 1.7097166944224358, | |
| "kl": 0.05224609375, | |
| "learning_rate": 3.103448275862069e-08, | |
| "loss": 0.0022, | |
| "reward": 1.9105417728424072, | |
| "reward_std": 0.017679734155535698, | |
| "rewards/accuracy_reward": 0.9105417728424072, | |
| "rewards/format_reward": 1.0, | |
| "step": 843 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.515625, | |
| "epoch": 9.701149425287356, | |
| "grad_norm": 1.884287958847617, | |
| "kl": 0.047119140625, | |
| "learning_rate": 2.9885057471264364e-08, | |
| "loss": 0.002, | |
| "reward": 1.9169318675994873, | |
| "reward_std": 0.020296216011047363, | |
| "rewards/accuracy_reward": 0.9182338714599609, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 844 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.80143737792969, | |
| "epoch": 9.71264367816092, | |
| "grad_norm": 1.9163910071570234, | |
| "kl": 0.0458984375, | |
| "learning_rate": 2.8735632183908043e-08, | |
| "loss": 0.002, | |
| "reward": 1.9143104553222656, | |
| "reward_std": 0.019828440621495247, | |
| "rewards/accuracy_reward": 0.9149616956710815, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 845 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.53190612792969, | |
| "epoch": 9.724137931034482, | |
| "grad_norm": 1.73873304639946, | |
| "kl": 0.0439453125, | |
| "learning_rate": 2.7586206896551723e-08, | |
| "loss": 0.0019, | |
| "reward": 1.9067649841308594, | |
| "reward_std": 0.018557554110884666, | |
| "rewards/accuracy_reward": 0.9067651033401489, | |
| "rewards/format_reward": 1.0, | |
| "step": 846 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.27604675292969, | |
| "epoch": 9.735632183908045, | |
| "grad_norm": 2.3263503442259807, | |
| "kl": 0.04345703125, | |
| "learning_rate": 2.64367816091954e-08, | |
| "loss": 0.0018, | |
| "reward": 1.9176081418991089, | |
| "reward_std": 0.018732137978076935, | |
| "rewards/accuracy_reward": 0.9176082015037537, | |
| "rewards/format_reward": 1.0, | |
| "step": 847 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.03125, | |
| "epoch": 9.74712643678161, | |
| "grad_norm": 1.495014646664977, | |
| "kl": 0.04931640625, | |
| "learning_rate": 2.528735632183908e-08, | |
| "loss": 0.0021, | |
| "reward": 1.914261817932129, | |
| "reward_std": 0.01746547594666481, | |
| "rewards/accuracy_reward": 0.9142619371414185, | |
| "rewards/format_reward": 1.0, | |
| "step": 848 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.43099212646484, | |
| "epoch": 9.758620689655173, | |
| "grad_norm": 2.129213121110771, | |
| "kl": 0.044189453125, | |
| "learning_rate": 2.4137931034482756e-08, | |
| "loss": 0.0018, | |
| "reward": 1.9081742763519287, | |
| "reward_std": 0.02037705108523369, | |
| "rewards/accuracy_reward": 0.9088253378868103, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 849 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.64453125, | |
| "epoch": 9.770114942528735, | |
| "grad_norm": 1.5696963468743126, | |
| "kl": 0.046142578125, | |
| "learning_rate": 2.2988505747126436e-08, | |
| "loss": 0.0019, | |
| "reward": 1.9174790382385254, | |
| "reward_std": 0.017753083258867264, | |
| "rewards/accuracy_reward": 0.9174790382385254, | |
| "rewards/format_reward": 1.0, | |
| "step": 850 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.01302337646484, | |
| "epoch": 9.781609195402298, | |
| "grad_norm": 2.3977935164505286, | |
| "kl": 0.050537109375, | |
| "learning_rate": 2.1839080459770113e-08, | |
| "loss": 0.0021, | |
| "reward": 1.8931522369384766, | |
| "reward_std": 0.019253626465797424, | |
| "rewards/accuracy_reward": 0.8931522369384766, | |
| "rewards/format_reward": 1.0, | |
| "step": 851 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.55339050292969, | |
| "epoch": 9.793103448275861, | |
| "grad_norm": 2.3423979754131383, | |
| "kl": 0.04541015625, | |
| "learning_rate": 2.0689655172413793e-08, | |
| "loss": 0.0019, | |
| "reward": 1.9183381795883179, | |
| "reward_std": 0.019186435267329216, | |
| "rewards/accuracy_reward": 0.9183380603790283, | |
| "rewards/format_reward": 1.0, | |
| "step": 852 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.85417175292969, | |
| "epoch": 9.804597701149426, | |
| "grad_norm": 2.073174318294833, | |
| "kl": 0.046630859375, | |
| "learning_rate": 1.954022988505747e-08, | |
| "loss": 0.0019, | |
| "reward": 1.890181303024292, | |
| "reward_std": 0.018408436328172684, | |
| "rewards/accuracy_reward": 0.8901811838150024, | |
| "rewards/format_reward": 1.0, | |
| "step": 853 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.06640625, | |
| "epoch": 9.816091954022989, | |
| "grad_norm": 1.9299645375268184, | |
| "kl": 0.05078125, | |
| "learning_rate": 1.839080459770115e-08, | |
| "loss": 0.0021, | |
| "reward": 1.9052647352218628, | |
| "reward_std": 0.017054375261068344, | |
| "rewards/accuracy_reward": 0.9052648544311523, | |
| "rewards/format_reward": 1.0, | |
| "step": 854 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.09896087646484, | |
| "epoch": 9.827586206896552, | |
| "grad_norm": 1.4804834876534272, | |
| "kl": 0.045166015625, | |
| "learning_rate": 1.7241379310344825e-08, | |
| "loss": 0.0019, | |
| "reward": 1.9028480052947998, | |
| "reward_std": 0.017569242045283318, | |
| "rewards/accuracy_reward": 0.9028478860855103, | |
| "rewards/format_reward": 1.0, | |
| "step": 855 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.41862487792969, | |
| "epoch": 9.839080459770114, | |
| "grad_norm": 3.920206306053486, | |
| "kl": 0.049560546875, | |
| "learning_rate": 1.6091954022988505e-08, | |
| "loss": 0.0021, | |
| "reward": 1.9126800298690796, | |
| "reward_std": 0.018128346651792526, | |
| "rewards/accuracy_reward": 0.913331151008606, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 856 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.01432800292969, | |
| "epoch": 9.850574712643677, | |
| "grad_norm": 3.684344553078877, | |
| "kl": 0.046875, | |
| "learning_rate": 1.4942528735632182e-08, | |
| "loss": 0.0019, | |
| "reward": 1.8925762176513672, | |
| "reward_std": 0.021131232380867004, | |
| "rewards/accuracy_reward": 0.8925762176513672, | |
| "rewards/format_reward": 1.0, | |
| "step": 857 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.326171875, | |
| "epoch": 9.862068965517242, | |
| "grad_norm": 1.3668989587295126, | |
| "kl": 0.04833984375, | |
| "learning_rate": 1.3793103448275862e-08, | |
| "loss": 0.002, | |
| "reward": 1.8880128860473633, | |
| "reward_std": 0.020463991910219193, | |
| "rewards/accuracy_reward": 0.8880130648612976, | |
| "rewards/format_reward": 1.0, | |
| "step": 858 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.76692962646484, | |
| "epoch": 9.873563218390805, | |
| "grad_norm": 2.38568885867465, | |
| "kl": 0.045166015625, | |
| "learning_rate": 1.264367816091954e-08, | |
| "loss": 0.0019, | |
| "reward": 1.9086008071899414, | |
| "reward_std": 0.020096510648727417, | |
| "rewards/accuracy_reward": 0.9086008071899414, | |
| "rewards/format_reward": 1.0, | |
| "step": 859 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.86198425292969, | |
| "epoch": 9.885057471264368, | |
| "grad_norm": 4.216482284140485, | |
| "kl": 0.1513671875, | |
| "learning_rate": 1.1494252873563218e-08, | |
| "loss": 0.0061, | |
| "reward": 1.9235996007919312, | |
| "reward_std": 0.019205166026949883, | |
| "rewards/accuracy_reward": 0.9242507815361023, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 860 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.77409362792969, | |
| "epoch": 9.89655172413793, | |
| "grad_norm": 4.8514292391747675, | |
| "kl": 0.0498046875, | |
| "learning_rate": 1.0344827586206896e-08, | |
| "loss": 0.0021, | |
| "reward": 1.9049177169799805, | |
| "reward_std": 0.017066650092601776, | |
| "rewards/accuracy_reward": 0.90491783618927, | |
| "rewards/format_reward": 1.0, | |
| "step": 861 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.80599212646484, | |
| "epoch": 9.908045977011493, | |
| "grad_norm": 1.450621393155795, | |
| "kl": 0.043212890625, | |
| "learning_rate": 9.195402298850574e-09, | |
| "loss": 0.0019, | |
| "reward": 1.9170316457748413, | |
| "reward_std": 0.016973568126559258, | |
| "rewards/accuracy_reward": 0.9170317649841309, | |
| "rewards/format_reward": 1.0, | |
| "step": 862 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.71614837646484, | |
| "epoch": 9.919540229885058, | |
| "grad_norm": 2.5531007269256762, | |
| "kl": 0.04931640625, | |
| "learning_rate": 8.045977011494253e-09, | |
| "loss": 0.002, | |
| "reward": 1.9102344512939453, | |
| "reward_std": 0.0186156053096056, | |
| "rewards/accuracy_reward": 0.9102343916893005, | |
| "rewards/format_reward": 1.0, | |
| "step": 863 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.51432800292969, | |
| "epoch": 9.931034482758621, | |
| "grad_norm": 1.7598054384403226, | |
| "kl": 0.044921875, | |
| "learning_rate": 6.896551724137931e-09, | |
| "loss": 0.0019, | |
| "reward": 1.8904626369476318, | |
| "reward_std": 0.022165637463331223, | |
| "rewards/accuracy_reward": 0.8911136388778687, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 864 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.57292175292969, | |
| "epoch": 9.942528735632184, | |
| "grad_norm": 341994.5350692805, | |
| "kl": 3088.0, | |
| "learning_rate": 5.747126436781609e-09, | |
| "loss": 123.6946, | |
| "reward": 1.9070371389389038, | |
| "reward_std": 0.020475659519433975, | |
| "rewards/accuracy_reward": 0.9076882600784302, | |
| "rewards/format_reward": 0.9993489980697632, | |
| "step": 865 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.62956237792969, | |
| "epoch": 9.954022988505747, | |
| "grad_norm": 18.068040151493758, | |
| "kl": 0.052001953125, | |
| "learning_rate": 4.597701149425287e-09, | |
| "loss": 0.0022, | |
| "reward": 1.910335898399353, | |
| "reward_std": 0.018777839839458466, | |
| "rewards/accuracy_reward": 0.9103360176086426, | |
| "rewards/format_reward": 1.0, | |
| "step": 866 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.146484375, | |
| "epoch": 9.96551724137931, | |
| "grad_norm": 4.0533200337142565, | |
| "kl": 0.046142578125, | |
| "learning_rate": 3.4482758620689654e-09, | |
| "loss": 0.0019, | |
| "reward": 1.9074934720993042, | |
| "reward_std": 0.02056286484003067, | |
| "rewards/accuracy_reward": 0.9074934720993042, | |
| "rewards/format_reward": 1.0, | |
| "step": 867 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.169921875, | |
| "epoch": 9.977011494252874, | |
| "grad_norm": 2.988025803486446, | |
| "kl": 0.04833984375, | |
| "learning_rate": 2.2988505747126436e-09, | |
| "loss": 0.002, | |
| "reward": 1.9082448482513428, | |
| "reward_std": 0.017992818728089333, | |
| "rewards/accuracy_reward": 0.908244788646698, | |
| "rewards/format_reward": 1.0, | |
| "step": 868 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.54622650146484, | |
| "epoch": 9.988505747126437, | |
| "grad_norm": 1.9804111140556449, | |
| "kl": 0.044677734375, | |
| "learning_rate": 1.1494252873563218e-09, | |
| "loss": 0.0018, | |
| "reward": 1.898494005203247, | |
| "reward_std": 0.01871921308338642, | |
| "rewards/accuracy_reward": 0.8984940052032471, | |
| "rewards/format_reward": 1.0, | |
| "step": 869 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 98.9199447631836, | |
| "epoch": 10.0, | |
| "grad_norm": 3.2087576817168033, | |
| "kl": 0.041015625, | |
| "learning_rate": 0.0, | |
| "loss": 0.0017, | |
| "reward": 1.9117414951324463, | |
| "reward_std": 0.016582123935222626, | |
| "rewards/accuracy_reward": 0.9117417335510254, | |
| "rewards/format_reward": 1.0, | |
| "step": 870 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 870, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 48, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |