Commit 40ba2bf

update examples
1 parent 502ffbf commit 40ba2bf

22 files changed, +114 -156 lines changed

examples/deepcoder/train_deepcoder.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 from rllm.trainer.agent_trainer import AgentTrainer


-@hydra.main(config_path="pkg://rllm.trainer.config", config_name="ppo_trainer", version_base=None)
+@hydra.main(config_path="pkg://rllm.trainer.config", config_name="agent_ppo_trainer", version_base=None)
 def main(config):
     train_dataset = DatasetRegistry.load_dataset("deepcoder", "train")
     test_dataset = DatasetRegistry.load_dataset("deepcoder", "test")
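
The shell scripts in this commit drive this entrypoint through Hydra command-line overrides. A minimal launch sketch, assuming the new agent_ppo_trainer config supplies the remaining defaults (the override values here are copied from the deepcoder scripts below and are illustrative, not a complete invocation):

# Minimal sketch: launch the entrypoint with a few of the renamed rllm.* overrides.
# Assumes agent_ppo_trainer defaults; see train_deepcoder_16k.sh for the full set.
python3 -m examples.deepcoder.train_deepcoder \
    rllm.agent.max_steps=1 \
    rllm.stepwise_advantage.enable=False \
    trainer.total_epochs=100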

examples/deepcoder/train_deepcoder_16k.sh

Lines changed: 3 additions & 5 deletions
@@ -43,7 +43,6 @@ python3 -m examples.deepcoder.train_deepcoder \
     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode="async" \
-    actor_rollout_ref.rollout.chat_scheduler=verl.schedulers.completions_scheduler.CompletionsScheduler \
     actor_rollout_ref.rollout.enforce_eager=False \
     actor_rollout_ref.rollout.temperature=0.6 \
     actor_rollout_ref.rollout.top_p=0.95 \
@@ -56,8 +55,7 @@ python3 -m examples.deepcoder.train_deepcoder \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
     algorithm.kl_ctrl.kl_coef=0.001 \
-    algorithm.mask_truncated_samples=True \
-    algorithm.clip_advantages=False \
+    rllm.mask_truncated_samples=True \
     trainer.critic_warmup=0 \
     trainer.logger=['console','wandb'] \
     trainer.project_name='rllm-deepcoder' \
@@ -68,6 +66,6 @@ python3 -m examples.deepcoder.train_deepcoder \
     trainer.save_freq=10 \
     trainer.test_freq=10 \
     trainer.default_hdfs_dir=null \
-    agent.max_steps=1 \
-    agent.use_stepwise_advantage=False \
+    rllm.agent.max_steps=1 \
+    rllm.stepwise_advantage.enable=False \
     trainer.total_epochs=100
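
This script shows the migration pattern the rest of the commit repeats: the explicit chat_scheduler override is dropped, algorithm.clip_advantages disappears, and the surviving algorithm.*/agent.* flags are renamed into the rllm.* namespace. A before/after sketch of just the renamed overrides from this script (old keys shown as comments; the surrounding command is unchanged):

# old: algorithm.mask_truncated_samples=True
# old: agent.max_steps=1
# old: agent.use_stepwise_advantage=False
# new equivalents under the rllm.* namespace:
    rllm.mask_truncated_samples=True \
    rllm.agent.max_steps=1 \
    rllm.stepwise_advantage.enable=False \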

examples/deepcoder/train_deepcoder_32k.sh

Lines changed: 3 additions & 5 deletions
@@ -43,7 +43,6 @@ python3 -m examples.deepcoder.train_deepcoder \
     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode="async" \
-    actor_rollout_ref.rollout.chat_scheduler=verl.schedulers.completions_scheduler.CompletionsScheduler \
     actor_rollout_ref.rollout.enforce_eager=False \
     actor_rollout_ref.rollout.temperature=0.6 \
     actor_rollout_ref.rollout.top_p=0.95 \
@@ -56,8 +55,7 @@ python3 -m examples.deepcoder.train_deepcoder \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
     algorithm.kl_ctrl.kl_coef=0.001 \
-    algorithm.mask_truncated_samples=False \
-    algorithm.clip_advantages=False \
+    rllm.mask_truncated_samples=False \
     trainer.critic_warmup=0 \
     trainer.logger=['console','wandb'] \
     trainer.project_name='rllm-deepcoder' \
@@ -68,6 +66,6 @@ python3 -m examples.deepcoder.train_deepcoder \
     trainer.save_freq=10 \
     trainer.test_freq=10 \
     trainer.default_hdfs_dir=null \
-    agent.max_steps=1 \
-    agent.use_stepwise_advantage=False \
+    rllm.agent.max_steps=1 \
+    rllm.stepwise_advantage.enable=False \
     trainer.total_epochs=100

examples/deepscaler/train_deepscaler.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 from rllm.trainer.agent_trainer import AgentTrainer


-@hydra.main(config_path="pkg://rllm.trainer.config", config_name="ppo_trainer", version_base=None)
+@hydra.main(config_path="pkg://rllm.trainer.config", config_name="agent_ppo_trainer", version_base=None)
 def main(config):
     train_dataset = DatasetRegistry.load_dataset("deepscaler_math", "train")
     test_dataset = DatasetRegistry.load_dataset("aime2024", "test")

examples/deepscaler/train_deepscaler_16k.sh

Lines changed: 3 additions & 6 deletions
@@ -30,14 +30,12 @@ python3 -m examples.deepscaler.train_deepscaler \
     actor_rollout_ref.actor.kl_loss_coef=0.001 \
     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
     actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
-    actor_rollout_ref.actor.grad_norm_threshold=10 \
     actor_rollout_ref.model.enable_gradient_checkpointing=True \
     actor_rollout_ref.actor.fsdp_config.param_offload=True \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
     actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode="async" \
-    actor_rollout_ref.rollout.chat_scheduler=verl.schedulers.completions_scheduler.CompletionsScheduler \
     actor_rollout_ref.rollout.enforce_eager=False \
     actor_rollout_ref.rollout.temperature=0.6 \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \
@@ -50,8 +48,7 @@ python3 -m examples.deepscaler.train_deepscaler \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.actor.entropy_coeff=0 \
     algorithm.kl_ctrl.kl_coef=0.001 \
-    algorithm.mask_truncated_samples=False \
-    algorithm.clip_advantages=False \
+    rllm.mask_truncated_samples=False \
     trainer.critic_warmup=0 \
     trainer.logger=['console','wandb'] \
     trainer.project_name='rllm-agent' \
@@ -62,6 +59,6 @@ python3 -m examples.deepscaler.train_deepscaler \
     trainer.save_freq=20 \
     trainer.test_freq=20 \
     trainer.default_hdfs_dir=null \
-    agent.max_steps=1 \
-    agent.use_stepwise_advantage=False \
+    rllm.agent.max_steps=1 \
+    rllm.stepwise_advantage.enable=False \
     trainer.total_epochs=100

examples/deepscaler/train_deepscaler_24k.sh

Lines changed: 3 additions & 6 deletions
@@ -30,14 +30,12 @@ python3 -m examples.deepscaler.train_deepscaler \
     actor_rollout_ref.actor.kl_loss_coef=0.001 \
     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
     actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
-    actor_rollout_ref.actor.grad_norm_threshold=10 \
     actor_rollout_ref.model.enable_gradient_checkpointing=True \
     actor_rollout_ref.actor.fsdp_config.param_offload=True \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
     actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode="async" \
-    actor_rollout_ref.rollout.chat_scheduler=verl.schedulers.completions_scheduler.CompletionsScheduler \
     actor_rollout_ref.rollout.enforce_eager=False \
     actor_rollout_ref.rollout.temperature=0.6 \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \
@@ -50,8 +48,7 @@ python3 -m examples.deepscaler.train_deepscaler \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.actor.entropy_coeff=0 \
     algorithm.kl_ctrl.kl_coef=0.001 \
-    algorithm.mask_truncated_samples=False \
-    algorithm.clip_advantages=False \
+    rllm.mask_truncated_samples=False \
     trainer.critic_warmup=0 \
     trainer.logger=['console','wandb'] \
     trainer.project_name='rllm-agent' \
@@ -62,6 +59,6 @@ python3 -m examples.deepscaler.train_deepscaler \
     trainer.save_freq=20 \
     trainer.test_freq=20 \
     trainer.default_hdfs_dir=null \
-    agent.max_steps=1 \
-    agent.use_stepwise_advantage=False \
+    rllm.agent.max_steps=1 \
+    rllm.stepwise_advantage.enable=False \
     trainer.total_epochs=100

examples/deepscaler/train_deepscaler_8k.sh

Lines changed: 3 additions & 6 deletions
@@ -30,14 +30,12 @@ python3 -m examples.deepscaler.train_deepscaler \
     actor_rollout_ref.actor.kl_loss_coef=0.001 \
     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
     actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
-    actor_rollout_ref.actor.grad_norm_threshold=10 \
     actor_rollout_ref.model.enable_gradient_checkpointing=True \
     actor_rollout_ref.actor.fsdp_config.param_offload=True \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
     actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode="async" \
-    actor_rollout_ref.rollout.chat_scheduler=verl.schedulers.completions_scheduler.CompletionsScheduler \
     actor_rollout_ref.rollout.enforce_eager=False \
     actor_rollout_ref.rollout.temperature=0.6 \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \
@@ -50,8 +48,7 @@ python3 -m examples.deepscaler.train_deepscaler \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.actor.entropy_coeff=0 \
     algorithm.kl_ctrl.kl_coef=0.001 \
-    algorithm.mask_truncated_samples=False \
-    algorithm.clip_advantages=False \
+    rllm.mask_truncated_samples=False \
     trainer.critic_warmup=0 \
     trainer.logger=['console','wandb'] \
     trainer.project_name='rllm-agent' \
@@ -62,6 +59,6 @@ python3 -m examples.deepscaler.train_deepscaler \
     trainer.save_freq=20 \
     trainer.test_freq=20 \
     trainer.default_hdfs_dir=null \
-    agent.max_steps=1 \
-    agent.use_stepwise_advantage=False \
+    rllm.agent.max_steps=1 \
+    rllm.stepwise_advantage.enable=False \
     trainer.total_epochs=100

examples/frozenlake/train_frozenlake_agent.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 from rllm.trainer.agent_trainer import AgentTrainer


-@hydra.main(config_path="pkg://rllm.trainer.config", config_name="ppo_trainer", version_base=None)
+@hydra.main(config_path="pkg://rllm.trainer.config", config_name="agent_ppo_trainer", version_base=None)
 def main(config):
     train_dataset = DatasetRegistry.load_dataset("frozenlake", "train")
     val_dataset = DatasetRegistry.load_dataset("frozenlake", "test")

examples/frozenlake/train_frozenlake_agent.sh

Lines changed: 10 additions & 14 deletions
@@ -28,14 +28,12 @@ python3 -m examples.frozenlake.train_frozenlake_agent \
     actor_rollout_ref.actor.kl_loss_coef=0.001 \
     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
     actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
-    actor_rollout_ref.actor.grad_norm_threshold=10 \
     actor_rollout_ref.model.enable_gradient_checkpointing=True \
     actor_rollout_ref.actor.fsdp_config.param_offload=True \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
     actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode="async" \
-    actor_rollout_ref.rollout.chat_scheduler=verl.schedulers.completions_scheduler.CompletionsScheduler \
     actor_rollout_ref.rollout.enforce_eager=False \
     actor_rollout_ref.rollout.temperature=0.7 \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \
@@ -49,8 +47,7 @@ python3 -m examples.frozenlake.train_frozenlake_agent \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.actor.entropy_coeff=0 \
     algorithm.kl_ctrl.kl_coef=0.001 \
-    algorithm.mask_truncated_samples=False \
-    algorithm.clip_advantages=False \
+    rllm.mask_truncated_samples=False \
     trainer.critic_warmup=0 \
     trainer.logger=['console','wandb'] \
     trainer.project_name='rllm-agent' \
@@ -61,14 +58,13 @@ python3 -m examples.frozenlake.train_frozenlake_agent \
     trainer.save_freq=40 \
     trainer.test_freq=10 \
     trainer.default_hdfs_dir=null \
-    trainer.rejection_sample=True \
-    trainer.rejection_sample_multiplier=2 \
-    +env.env_args.max_steps=8 \
-    +env.env_args.is_slippery=False \
-    agent.max_steps=10 \
-    agent.async_engine=True \
-    agent.use_stepwise_advantage=False \
-    +agent.engine_args.disable_thinking=False \
-    +agent.agent_args.max_steps=10 \
-    +agent.agent_args.use_accumulate_history=True \
+    rllm.rejection_sample.enable=True \
+    rllm.rejection_sample.multiplier=2 \
+    +rllm.env.env_args.max_steps=8 \
+    +rllm.env.env_args.is_slippery=False \
+    rllm.agent.max_steps=10 \
+    rllm.stepwise_advantage.enable=False \
+    rllm.disable_thinking=False \
+    +rllm.agent.agent_args.max_steps=10 \
+    +rllm.agent.agent_args.use_accumulate_history=True \
     trainer.total_epochs=1
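
The frozenlake script carries the broadest renaming in the commit: rejection sampling moves from trainer.rejection_sample* to rllm.rejection_sample.*, the appended +env.env_args.* and +agent.agent_args.* keys gain the rllm. prefix, agent.engine_args.disable_thinking becomes the top-level rllm.disable_thinking, and agent.async_engine is dropped with no replacement visible here. Isolating the new override block from the + side of the final hunk (verbatim apart from the assumed indentation):

# New rllm.* override block, copied from the + lines above:
    rllm.rejection_sample.enable=True \
    rllm.rejection_sample.multiplier=2 \
    +rllm.env.env_args.max_steps=8 \
    +rllm.env.env_args.is_slippery=False \
    rllm.agent.max_steps=10 \
    rllm.stepwise_advantage.enable=False \
    rllm.disable_thinking=False \
    +rllm.agent.agent_args.max_steps=10 \
    +rllm.agent.agent_args.use_accumulate_history=True \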

examples/math_tool/train_math_with_tool.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 from rllm.trainer.agent_trainer import AgentTrainer


-@hydra.main(config_path="pkg://rllm.trainer.config", config_name="ppo_trainer", version_base=None)
+@hydra.main(config_path="pkg://rllm.trainer.config", config_name="agent_ppo_trainer", version_base=None)
 def main(config):
     train_dataset = DatasetRegistry.load_dataset("deepscaler_math", "train")
     test_dataset = DatasetRegistry.load_dataset("aime2024", "test")
