Commit 8fa061c

fix agents and envs
1 parent 3fa67af commit 8fa061c

Showing 6 changed files with 95 additions and 39 deletions.


rllm/agents/code_agent.py

Lines changed: 38 additions & 21 deletions
@@ -23,17 +23,17 @@ class CompetitionCodingAgent(BaseAgent):
     A code agent that iteratively writes code to solve a problem.
     """
 
-    def __init__(self, remove_thinking=False, max_tests=2, public_test_only=True):
+    def __init__(self, accumulate_thinking=False, max_tests=2, public_test_only=False):
         """
         Initialize the CodeAgent.
         """
         self.revise_instruction = "Here's the feedback from the previous attempt. Revise the code to fix the errors and improve the solution."
         self._trajectory = Trajectory()
         self.messages = []
-        self.remove_thinking = remove_thinking
+        self.accumulate_thinking = accumulate_thinking
+
         self.max_tests = max_tests
         self.public_test_only = public_test_only
-        self.current_observation = None
 
     def format_test_results(self, test_results: list[dict]) -> str:
         def normalize_string(s):
@@ -102,46 +102,63 @@ def update_from_env(self, observation: Any, reward: float, done: bool, info: dict
         else:
             formatted_observation = str(observation)
 
+        # Update reward on the latest step
+        if self.trajectory.steps:
+            cur_step = self.get_current_state()
+            cur_step.reward = reward
+            cur_step.done = done
+            cur_step.info = info
+
         if done:
             return
 
         self.messages.append({"role": "user", "content": formatted_observation})
-        self.current_observation = formatted_observation
+
+        new_step = Step(observation=formatted_observation)
+        self._trajectory.steps.append(new_step)
 
     def update_from_model(self, response: str, **kwargs) -> Action:
         """
         Updates the agent's internal state based on the model's response.
         """
-        content = response
-        action = response
-
-        # Handle thinking removal if needed
-        if self.remove_thinking and content.count("</think>") == 1:
-            thought, action = response.split("</think>")
-            thought += "</think>"
-            action = action.strip()
-            self.messages.append({"role": "assistant", "content": action})
+        self.messages.append({"role": "assistant", "content": response})
+
+        cur_step = self.get_current_state()
+        cur_step.chat_completions = self.chat_completions
+        cur_step.model_response = response
+
+        if response.count("</think>") == 1:
+            thought, sep, action = response.partition("</think>")
+            thought = thought + sep
+            action = Action(action.strip())
         else:
-            self.messages.append({"role": "assistant", "content": response})
+            thought = None
+            action = Action(response.strip())
 
-        # Create new step
-        new_step = Step(chat_completions=copy.deepcopy(self.chat_completions), action=action, model_response=response, observation=self.current_observation)
-        self._trajectory.steps.append(new_step)
+        cur_step.thought = thought
+        cur_step.action = action
 
-        return Action(action=action)
+        return action
 
     def reset(self):
         """
         Resets the agent's internal state for a new episode.
         """
         self._trajectory = Trajectory()
         self.messages = []
-        self.current_observation = None
 
     @property
     def chat_completions(self) -> list[dict[str, str]]:
-        """Returns the history of messages for chat completion."""
-        return self.messages
+        """Return conversation history for model interaction."""
+        # remove thinking from assistant messages if not accumulate_thinking except the last one
+        messages = copy.deepcopy(self.messages)
+        if not self.accumulate_thinking:
+            for msg in messages[:-1]:
+                if msg["role"] == "assistant":
+                    _, sep, after = msg["content"].partition("</think>")
+                    if sep:
+                        msg["content"] = after
+        return messages
 
     @property
     def trajectory(self) -> Trajectory:
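Note on the chat_completions change above: when accumulate_thinking is False, the agent now strips the <think>…</think> prefix from every assistant message except the most recent one before handing the history back to the model. Below is a minimal standalone sketch of that behavior; the helper name strip_thinking and the sample messages are illustrative only and are not part of the commit.

import copy

def strip_thinking(messages: list[dict], accumulate_thinking: bool = False) -> list[dict]:
    # Mirror of the new chat_completions logic: drop the thinking prefix from all
    # but the last assistant message unless accumulate_thinking is enabled.
    messages = copy.deepcopy(messages)
    if not accumulate_thinking:
        for msg in messages[:-1]:
            if msg["role"] == "assistant":
                _, sep, after = msg["content"].partition("</think>")
                if sep:
                    msg["content"] = after
    return messages

history = [
    {"role": "user", "content": "Reverse a string."},
    {"role": "assistant", "content": "<think>use slicing</think>def rev(s): return s[::-1]"},
    {"role": "user", "content": "Here's the feedback from the previous attempt..."},
    {"role": "assistant", "content": "<think>handle empty input</think>def rev(s): return s[::-1]"},
]

# The first assistant message loses its <think> block; the last one keeps it.
print(strip_thinking(history))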

rllm/agents/math_agent.py

Lines changed: 38 additions & 14 deletions
@@ -13,38 +13,62 @@ def __init__(self, accumulate_thinking=True):
         """
         Initialize the MathAgent.
         """
-        self.instruction = "Let's think step by step, and put your final answer within \\boxed{}."
         self._trajectory = Trajectory()
         self.messages = []
         self.accumulate_thinking = accumulate_thinking
 
     def update_from_env(self, observation: Any, reward: float, done: bool, info: dict, **kwargs):
         """Process environment feedback and update internal state."""
 
-        # Format observation based on whether it's the initial problem or subsequent feedback
-        if not self.trajectory.steps:
-            # Initial problem presentation
-            assert isinstance(observation, dict) and "question" in observation
-            question = observation["question"]
-            formatted_observation = f"{question} {self.instruction}"
+        # If observation is None, this is a reward update for the existing step
+        if observation is None:
+            if self.trajectory.steps:
+                cur_step = self.get_current_state()
+                cur_step.reward = reward
+                cur_step.done = done
+                cur_step.info = info
+            return
+
+        # This is a new observation, create a new step
+        if isinstance(observation, dict):
+            formatted_observation = observation["question"]
+        elif isinstance(observation, str):
+            formatted_observation = observation
         else:
-            # Follow-up correction prompt
-            formatted_observation = "Your previous answer may contain a mistake. Please review it carefully and answer again. Put your final answer within \\boxed{}."
+            raise ValueError(f"Invalid observation type: {type(observation)}")
 
         self.messages.append({"role": "user", "content": formatted_observation})
 
+        new_step = Step(observation=formatted_observation)
+        self._trajectory.steps.append(new_step)
+
     def update_from_model(self, response: str, **kwargs) -> Action:
         """
         Updates the agent's internal state based on the model's response.
         """
+
+        # Update the latest step
         self.messages.append({"role": "assistant", "content": response})
-        new_step = Step(chat_completions=copy.deepcopy(self.chat_completions))
-        self.trajectory.steps.append(new_step)
 
-        return Action(action=response)
+        cur_step = self.get_current_state()
+        cur_step.chat_completions = self.chat_completions
+        cur_step.model_response = response
+
+        if response.count("</think>") == 1:
+            thought, sep, action = response.partition("</think>")
+            thought = thought + sep
+            action = Action(action.strip())
+        else:
+            thought = None
+            action = Action(response.strip())
+
+        cur_step.thought = thought
+        cur_step.action = action
+
+        return action
 
-    def reset(self):
-        """Reset agent state for new episode."""
+    def reset(self) -> None:
+        """Reset agent state for new episode (wipes trajectory and messages)."""
         self._trajectory = Trajectory()
         self.messages = []
 
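Both agents now parse the model response the same way: a single </think> marker splits the thought from the action. A self-contained sketch of that parsing follows; the function name split_response and the example response are illustrative, and a plain tuple stands in for the repo's Action type.

def split_response(response: str) -> tuple[str | None, str]:
    # One "</think>" marker: everything before it (plus the marker) is the thought,
    # everything after it is the action. Otherwise the whole response is the action.
    if response.count("</think>") == 1:
        thought, sep, action = response.partition("</think>")
        return thought + sep, action.strip()
    return None, response.strip()

thought, answer = split_response("<think>2 + 2 = 4</think>\\boxed{4}")
assert thought == "<think>2 + 2 = 4</think>"
assert answer == "\\boxed{4}"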

rllm/environments/base/multi_turn_env.py

Lines changed: 6 additions & 2 deletions
@@ -25,9 +25,13 @@ def __init__(self, task: dict | None = None, max_turns: int = 3, **kwargs):
         self.max_turns = max_turns
         self.current_turn = 0
         self.done = False
-        self.history: list[Any] = []
+        self.history = []
+
+    def reset(self, task: dict | None = None):
+        # Use the provided task if available, otherwise use the default task
+        if task is not None:
+            self.task = task
 
-    def reset(self):
        self.done = False
        self.current_turn = 0
        self.history = []
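To illustrate the new reset(task=...) contract: a task passed to reset overrides the default task, otherwise the environment keeps the one it was constructed with. The class below is a minimal stand-in written for this example, not the actual MultiTurnEnvironment.

class _StubEnv:
    # Stand-in sketch that mirrors only the reset(task=...) behavior from the diff.
    def __init__(self, task: dict | None = None, max_turns: int = 3):
        self.task = task
        self.max_turns = max_turns
        self.current_turn = 0
        self.done = False
        self.history = []

    def reset(self, task: dict | None = None):
        # Use the provided task if available, otherwise keep the default task
        if task is not None:
            self.task = task
        self.done = False
        self.current_turn = 0
        self.history = []

env = _StubEnv(task={"question": "default question"})
env.reset()                                        # keeps the task passed at construction
assert env.task == {"question": "default question"}
env.reset(task={"question": "override question"})  # swaps in a per-episode task
assert env.task == {"question": "override question"}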

rllm/environments/code/competition_coding.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def reset(self, task=None, seed=None):
         self.prev_reward = None
 
         # Return the first question
-        return {"question": self.task["question"]}, {}
+        return self.task, {}
 
     def step(self, action):
         """

rllm/rewards/countdown_reward.py

Lines changed: 5 additions & 1 deletion
@@ -1,6 +1,7 @@
 import random
 import re
 
+from rllm import Action
 from rllm.rewards.reward_types import RewardOutput
 
 
@@ -109,7 +110,7 @@ def compute_score(solution_str, ground_truth, method="strict", format_score=0.1,
         return format_score
 
 
-def countdown_reward_fn(task_info: dict, action: str) -> RewardOutput:
+def countdown_reward_fn(task_info: dict, action: str | Action) -> RewardOutput:
     """
     A specialized reward function for countdown tasks using the compute_score helper.
 
@@ -124,6 +125,9 @@ def countdown_reward_fn(task_info: dict, action: str) -> RewardOutput:
         RewardOutput with reward and metadata
     """
     try:
+        if isinstance(action, Action):
+            action = action.action
+
         # Extract basic info
         target = task_info.get("target")
         nums = task_info.get("nums", [])

rllm/rewards/reward_fn.py

Lines changed: 7 additions & 0 deletions
@@ -1,5 +1,6 @@
 from typing import Protocol, runtime_checkable
 
+from rllm.agents.agent import Action
 from rllm.rewards.code_reward import RewardCodeFn
 from rllm.rewards.math_reward import RewardMathFn
 from rllm.rewards.reward_types import RewardConfig, RewardInput, RewardOutput
@@ -53,6 +54,8 @@ def math_reward_fn(task_info: dict, action: str) -> RewardOutput:
     """
     reward_config = RewardConfig()
     reward_fn = RewardMathFn(reward_config)
+    if isinstance(action, Action):
+        action = action.action
     return reward_fn(task_info, action)
 
 
@@ -69,6 +72,8 @@ def search_reward_fn(task_info: dict, action: str) -> RewardOutput:
     """
     reward_config = RewardConfig()
     reward_fn = RewardSearchFn(reward_config)
+    if isinstance(action, Action):
+        action = action.action
 
     # Create RewardInput from task_info and action
     reward_input = RewardInput(task_info=task_info, action=action)
@@ -89,4 +94,6 @@ def code_reward_fn(task_info: dict, action: str) -> RewardOutput:
     """
     reward_config = RewardConfig()
     reward_fn = RewardCodeFn(reward_config)
+    if isinstance(action, Action):
+        action = action.action
     return reward_fn(task_info, action)
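The reward functions here, and countdown_reward_fn above, now accept either a raw string or an Action and unwrap the latter before scoring. A self-contained sketch of that pattern follows; the Action dataclass below is only a stand-in for rllm.agents.agent.Action, assuming it exposes its payload via an .action attribute as the diff suggests.

from dataclasses import dataclass

@dataclass
class Action:
    action: str  # stand-in for rllm.agents.agent.Action

def normalize_action(action) -> str:
    # Same unwrapping the reward functions perform before scoring.
    if isinstance(action, Action):
        action = action.action
    return action

assert normalize_action(Action("\\boxed{42}")) == "\\boxed{42}"
assert normalize_action("\\boxed{42}") == "\\boxed{42}"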
