
Commit 1b393de

fix ppo trainer, remove gpu dependencies
1 parent b7381ab · commit 1b393de

6 files changed: +197, -117 lines changed


Dockerfile

Lines changed: 2 additions & 6 deletions
@@ -4,13 +4,9 @@ ENV DEBIAN_FRONTEND=noninteractive

 WORKDIR /workspace

-RUN pip uninstall verl -y || true
+RUN git clone https://github.com/rllm-org/rllm.git rllm

-RUN git clone --recurse-submodules https://github.com/rllm-org/rllm.git rllm
-
-RUN cd rllm && \
-    pip install -e ./verl && \
-    pip install -e .
+RUN cd rllm && pip install -e .

 RUN pip install playwright && \
     playwright install chromium && \

README.md

Lines changed: 0 additions & 2 deletions
@@ -56,8 +56,6 @@ bash scripts/install_verl.sh # (or follow the instructions at https://verl.readt

 # Install rllm
 pip install -e .
-
-**Note:** On macOS, GPU features (flash-attn, deepspeed, vllm) are automatically excluded for compatibility. For GPU support on macOS, you can install with: `pip install -e .[gpu]`
 ```

 ### Installation with Docker 🐳

examples/solver_judge/train_solver_judge_flow.py

Lines changed: 1 addition & 1 deletion
@@ -1,8 +1,8 @@
 import hydra

-from examples.countdown.countdown_reward import countdown_reward_fn
 from examples.solver_judge.solver_judge_flow import SolverJudgeWorkflow
 from rllm.data.dataset import DatasetRegistry
+from rllm.rewards.countdown_reward import countdown_reward_fn
 from rllm.trainer.agent_trainer import AgentTrainer



pyproject.toml

Lines changed: 0 additions & 5 deletions
@@ -89,11 +89,6 @@ dependencies = [
 ]

 [project.optional-dependencies]
-gpu = [
-    "flash-attn>=2.7.4.post1; sys_platform != 'darwin'",
-    "vllm>=0.8.3; sys_platform != 'darwin'",
-    "sglang>=0.4.6.post1; sys_platform != 'darwin'",
-]

 smolagents = [
     "smolagents==1.20.0",

rllm/trainer/agent_trainer.py

Lines changed: 6 additions & 3 deletions
@@ -1,9 +1,10 @@
 from typing import Any

 import ray
+from verl.trainer.constants_ppo import get_ppo_ray_runtime_env

 from rllm.data import Dataset
-from rllm.trainer.verl.train_agent_ppo import train_agent
+from rllm.trainer.verl.train_agent_ppo import TaskRunner


 class AgentTrainer:
@@ -55,10 +56,12 @@ def __init__(

     def train(self):
         if not ray.is_initialized():
-            ray.init(runtime_env={"env_vars": {"TOKENIZERS_PARALLELISM": "true", "NCCL_DEBUG": "WARN"}})
+            ray.init(runtime_env=get_ppo_ray_runtime_env(), num_cpus=self.config.ray_init.num_cpus)
+
+        runner = TaskRunner.remote()

         ray.get(
-            train_agent.remote(
+            runner.run.remote(
                 config=self.config,
                 workflow_class=self.workflow_class,
                 workflow_args=self.workflow_args,
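
With this change, `AgentTrainer.train()` creates a single `TaskRunner` Ray actor and blocks on its `run()` method instead of calling a remote function. Below is a minimal, runnable sketch of that call pattern; the `TaskRunner` body, config dict, and workflow name are toy stand-ins for illustration, not the real rllm/verl implementations.

```python
# Toy illustration of the actor-based call pattern used by the new train() method.
# This TaskRunner is a stand-in, not rllm's; the config and workflow name are fake.
import ray


@ray.remote(num_cpus=1)
class TaskRunner:
    """Stand-in remote task runner: one actor, one long-running run() call."""

    def run(self, config, workflow_class=None, workflow_args=None):
        # The real run() builds workers, datasets, and reward functions, then
        # drives trainer.fit_agent(); this stand-in only echoes its arguments.
        return f"would train {workflow_class} with config={config}"


if __name__ == "__main__":
    if not ray.is_initialized():
        # Stand-in for ray.init(runtime_env=get_ppo_ray_runtime_env(), num_cpus=...)
        ray.init(num_cpus=2)

    runner = TaskRunner.remote()  # create the actor once
    # Block until the remote run() completes, exactly as the new train() does.
    result = ray.get(runner.run.remote(config={"trainer": {"nnodes": 1}}, workflow_class="SolverJudgeWorkflow"))
    print(result)
    ray.shutdown()
```

Creating one long-lived actor and awaiting a single method keeps all trainer state inside one worker process, which is the shape of verl's TaskRunner pattern that this commit adopts.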

rllm/trainer/verl/train_agent_ppo.py

Lines changed: 188 additions & 100 deletions
@@ -2,9 +2,12 @@
 """
 Note that we don't combine the main with ray_trainer as ray_trainer is used by other main.
 """
+import os
+import socket

 import hydra
 import ray
+from omegaconf import OmegaConf
 from verl.trainer.ppo.reward import load_reward_manager

 from rllm.trainer.env_agent_mappings import AGENT_CLASS_MAPPING, ENV_CLASS_MAPPING, WORKFLOW_CLASS_MAPPING
@@ -20,121 +23,206 @@ def main(config):


 def run_ppo_agent(config):
+    # Check if Ray is not initialized
     if not ray.is_initialized():
-        # this is for local ray cluster
-        ray.init(runtime_env={"env_vars": {"TOKENIZERS_PARALLELISM": "true", "NCCL_DEBUG": "WARN"}})
+        # Initialize Ray with a local cluster configuration
+        # Set environment variables in the runtime environment to control tokenizer parallelism,
+        # NCCL debug level, VLLM logging level, and allow runtime LoRA updating
+        # `num_cpus` specifies the number of CPU cores Ray can use, obtained from the configuration
+        ray.init(
+            runtime_env=get_ppo_ray_runtime_env(),
+            num_cpus=config.ray_init.num_cpus,
+        )
+
+    # Create a remote instance of the TaskRunner class, and
+    # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete
+    if (
+        is_cuda_available
+        and config.trainer.get("profile_steps") is not None
+        and len(config.trainer.get("profile_steps", [])) > 0
+    ):
+        nsight_options = OmegaConf.to_container(config.trainer.controller_nsight_options)
+        runner = TaskRunner.options(runtime_env={"nsight": nsight_options}).remote()
+    else:
+        runner = TaskRunner.remote()
+    ray.get(runner.run.remote(config))

-    ray.get(train_agent.remote(config))
+    # [Optional] get the path of the timeline trace file from the configuration, default to None
+    # This file is used for performance analysis
+    timeline_json_file = config.ray_init.get("timeline_json_file", None)
+    if timeline_json_file:
+        ray.timeline(filename=timeline_json_file)


 @ray.remote(num_cpus=1)  # please make sure main_task is not scheduled on head
-def train_agent(config, workflow_class=None, workflow_args=None, agent_class=None, env_class=None, agent_args=None, env_args=None):
-    # print initial config
-    from pprint import pprint
+class TaskRunner:
+    """Ray remote class for executing distributed PPO training tasks.

-    from omegaconf import OmegaConf
-    from verl.utils.fs import copy_local_path_from_hdfs
+    This class encapsulates the main training logic and runs as a Ray remote actor
+    to enable distributed execution across multiple nodes and GPUs.
+    """

-    OmegaConf.register_new_resolver("mul", lambda x, y: int(x) * int(y))
-    OmegaConf.resolve(config)
-    pprint(OmegaConf.to_container(config))
+    def run(self, config, workflow_class=None, workflow_args=None, agent_class=None, env_class=None, agent_args=None, env_args=None):
+        """Execute the main PPO training workflow.

-    # download the checkpoint from hdfs
-    local_path = copy_local_path_from_hdfs(config.actor_rollout_ref.model.path)
+        This method sets up the distributed training environment, initializes
+        workers, datasets, and reward functions, then starts the training process.

-    # instantiate tokenizer
-    from verl.utils import hf_tokenizer
+        Args:
+            config: Training configuration object containing all parameters needed
+                for setting up and running the PPO training process.
+        """
+        # Print the initial configuration. `resolve=True` will evaluate symbolic values.
+        from pprint import pprint

-    trust_remote_code = config.data.get("trust_remote_code", False)
-    tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
-    # processor = hf_processor(local_path, use_fast=True) # used for multimodal LLM, could be none
+        from omegaconf import OmegaConf

-    if config.actor_rollout_ref.actor.strategy in ["fsdp", "fsdp2"]:
-        assert config.critic.strategy in ["fsdp", "fsdp2"]
-        from verl.single_controller.ray import RayWorkerGroup
-        from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker
+        from verl.utils.fs import copy_to_local

-        actor_rollout_cls = AsyncActorRolloutRefWorker if config.actor_rollout_ref.rollout.mode == "async" else ActorRolloutRefWorker
-        ray_worker_group_cls = RayWorkerGroup
-    else:
-        raise NotImplementedError
-
-    from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
-
-    role_worker_mapping = {
-        Role.ActorRollout: ray.remote(max_concurrency=2048)(actor_rollout_cls),
-        Role.Critic: ray.remote(CriticWorker),
-    }
-
-    global_pool_id = "global_pool"
-    resource_pool_spec = {
-        global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
-    }
-    mapping = {
-        Role.ActorRollout: global_pool_id,
-        Role.Critic: global_pool_id,
-    }
-
-    if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
-        role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
-        mapping[Role.RefPolicy] = global_pool_id
-
-    reward_fn = load_reward_manager(config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {}))
-    val_reward_fn = load_reward_manager(config, tokenizer, num_examine=1)
-    resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
-
-    if config.rllm.workflow.use_workflow:
-        if workflow_class is None:
-            workflow_class = WORKFLOW_CLASS_MAPPING[config.rllm.workflow.name]
-        workflow_args = workflow_args or {}
-        if config.rllm.workflow.get("workflow_args") is not None:
-            workflow_args.update(config.rllm.workflow.get("workflow_args"))
-
-        trainer = AgentWorkflowPPOTrainer(
-            config=config,
-            tokenizer=tokenizer,
-            role_worker_mapping=role_worker_mapping,
-            resource_pool_manager=resource_pool_manager,
-            ray_worker_group_cls=ray_worker_group_cls,
-            reward_fn=reward_fn,
-            val_reward_fn=val_reward_fn,
-            workflow_class=workflow_class,
-            workflow_args=workflow_args,
-        )
+        print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}")
+        OmegaConf.register_new_resolver("mul", lambda x, y: int(x) * int(y))
+        OmegaConf.resolve(config)
+        pprint(OmegaConf.to_container(config))

-    else:
-        if env_class is None:
-            env_class = ENV_CLASS_MAPPING[config.rllm.env.name]
-        if agent_class is None:
-            agent_class = AGENT_CLASS_MAPPING[config.rllm.agent.name]
-
-        env_args = env_args or {}
-        agent_args = agent_args or {}
-        if config.rllm.env.get("env_args") is not None:
-            env_args.update(config.rllm.env.get("env_args"))
-        if config.rllm.agent.get("agent_args") is not None:
-            agent_args.update(config.rllm.agent.get("agent_args"))
-
-        trainer = AgentPPOTrainer(
-            config=config,
-            tokenizer=tokenizer,
-            role_worker_mapping=role_worker_mapping,
-            resource_pool_manager=resource_pool_manager,
-            ray_worker_group_cls=ray_worker_group_cls,
-            reward_fn=reward_fn,
-            val_reward_fn=val_reward_fn,
-            env_class=env_class,
-            agent_class=agent_class,
-            env_args=env_args,
-            agent_args=agent_args,
+        # Download the checkpoint from HDFS to the local machine.
+        # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on
+        local_path = copy_to_local(
+            config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False)
        )

-    trainer.init_workers()
-    try:
-        trainer.fit_agent()
-    finally:
-        trainer.shutdown()
-
+        # Instantiate the tokenizer and processor.
+        from verl.utils import hf_tokenizer
+
+        trust_remote_code = config.data.get("trust_remote_code", False)
+        tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
+        # Used for multimodal LLM, could be None
+        # processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True)
+
+        # Define worker classes based on the actor strategy.
+        if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}:
+            assert config.critic.strategy in {"fsdp", "fsdp2"}
+            from verl.single_controller.ray import RayWorkerGroup
+            from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker
+
+            use_legacy_worker_impl = config.trainer.get("use_legacy_worker_impl", "auto")
+            if use_legacy_worker_impl in ["auto", "enable"]:
+                # import warnings
+                # warnings.warn(f"Legacy worker impl is going to be deprecated, will be removed in the future. \
+                #     Please set trainer.use_legacy_worker_impl = false to switch to the new worker implementation.")
+                from verl.workers.fsdp_workers import CriticWorker
+            elif use_legacy_worker_impl == "disable":
+                from verl.workers.roles import CriticWorker
+
+                print("Using new worker implementation")
+            else:
+                raise ValueError(f"Invalid use_legacy_worker_impl: {use_legacy_worker_impl}")
+
+            actor_rollout_cls = (
+                AsyncActorRolloutRefWorker
+                if config.actor_rollout_ref.rollout.mode == "async"
+                else ActorRolloutRefWorker
+            )
+            ray_worker_group_cls = RayWorkerGroup
+
+        elif config.actor_rollout_ref.actor.strategy == "megatron":
+            assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
+            from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
+            from verl.workers.megatron_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker
+
+            actor_rollout_cls = (
+                AsyncActorRolloutRefWorker
+                if config.actor_rollout_ref.rollout.mode == "async"
+                else ActorRolloutRefWorker
+            )
+            ray_worker_group_cls = NVMegatronRayWorkerGroup
+
+        else:
+            raise NotImplementedError
+
+        from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
+
+        # Map roles to their corresponding remote worker classes.
+        role_worker_mapping = {
+            Role.ActorRollout: ray.remote(actor_rollout_cls),
+            Role.Critic: ray.remote(CriticWorker),
+        }
+
+        # Define the resource pool specification.
+        # Map roles to the resource pool.
+        global_pool_id = "global_pool"
+        resource_pool_spec = {
+            global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
+        }
+        mapping = {
+            Role.ActorRollout: global_pool_id,
+            Role.Critic: global_pool_id,
+        }
+
+        # Add a reference policy worker if KL loss or KL reward is used.
+        if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+            role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
+            mapping[Role.RefPolicy] = global_pool_id
+
+        # Load the reward manager for training and validation.
+        reward_fn = load_reward_manager(
+            config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {})
+        )
+        val_reward_fn = load_reward_manager(
+            config, tokenizer, num_examine=1, **config.reward_model.get("reward_kwargs", {})
+        )
+        resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
+
+        if config.rllm.workflow.use_workflow:
+            if workflow_class is None:
+                workflow_class = WORKFLOW_CLASS_MAPPING[config.rllm.workflow.name]
+            workflow_args = workflow_args or {}
+            if config.rllm.workflow.get("workflow_args") is not None:
+                workflow_args.update(config.rllm.workflow.get("workflow_args"))
+
+            trainer = AgentWorkflowPPOTrainer(
+                config=config,
+                tokenizer=tokenizer,
+                role_worker_mapping=role_worker_mapping,
+                resource_pool_manager=resource_pool_manager,
+                ray_worker_group_cls=ray_worker_group_cls,
+                reward_fn=reward_fn,
+                val_reward_fn=val_reward_fn,
+                workflow_class=workflow_class,
+                workflow_args=workflow_args,
+            )
+
+        else:
+            if env_class is None:
+                env_class = ENV_CLASS_MAPPING[config.rllm.env.name]
+            if agent_class is None:
+                agent_class = AGENT_CLASS_MAPPING[config.rllm.agent.name]
+
+            env_args = env_args or {}
+            agent_args = agent_args or {}
+            if config.rllm.env.get("env_args") is not None:
+                env_args.update(config.rllm.env.get("env_args"))
+            if config.rllm.agent.get("agent_args") is not None:
+                agent_args.update(config.rllm.agent.get("agent_args"))
+
+            trainer = AgentPPOTrainer(
+                config=config,
+                tokenizer=tokenizer,
+                role_worker_mapping=role_worker_mapping,
+                resource_pool_manager=resource_pool_manager,
+                ray_worker_group_cls=ray_worker_group_cls,
+                reward_fn=reward_fn,
+                val_reward_fn=val_reward_fn,
+                env_class=env_class,
+                agent_class=agent_class,
+                env_args=env_args,
+                agent_args=agent_args,
+            )
+
+        trainer.init_workers()
+        try:
+            trainer.fit_agent()
+        finally:
+            trainer.shutdown()

 if __name__ == "__main__":
     main()
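
The new `TaskRunner.run()` above assigns every role (actor rollout, critic, and an optional reference policy) to a single shared resource pool. The sketch below reproduces only that bookkeeping with plain-Python stand-ins for verl's `Role` enum and Ray worker classes, to make the shape of `role_worker_mapping`, `resource_pool_spec`, and `mapping` explicit; the pool size and the demo values are assumptions for illustration.

```python
# Toy reconstruction of the role -> resource-pool bookkeeping done in TaskRunner.run().
# Uses a local Enum instead of verl's Role and strings instead of Ray worker classes.
from enum import Enum, auto


class Role(Enum):
    ActorRollout = auto()
    Critic = auto()
    RefPolicy = auto()


def build_pools(n_gpus_per_node: int, nnodes: int, use_kl: bool):
    # Every role shares one global pool sized n_gpus_per_node per node.
    global_pool_id = "global_pool"
    resource_pool_spec = {global_pool_id: [n_gpus_per_node] * nnodes}

    # Stand-in worker "classes" (strings here; Ray remote classes in the real code).
    role_worker_mapping = {Role.ActorRollout: "ActorRolloutRefWorker", Role.Critic: "CriticWorker"}
    mapping = {Role.ActorRollout: global_pool_id, Role.Critic: global_pool_id}

    # A reference policy worker is only added when a KL term is used.
    if use_kl:
        role_worker_mapping[Role.RefPolicy] = "ActorRolloutRefWorker"
        mapping[Role.RefPolicy] = global_pool_id

    return resource_pool_spec, role_worker_mapping, mapping


if __name__ == "__main__":
    spec, workers, pool_of = build_pools(n_gpus_per_node=8, nnodes=2, use_kl=True)
    print(spec)      # {'global_pool': [8, 8]}
    print(workers)   # which worker class serves each role
    print(pool_of)   # which pool each role is scheduled on
```

In the real code these dictionaries are handed to `ResourcePoolManager` and the trainer, which is why adding the reference policy is just two extra dictionary entries.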
