patched-codes · whoisarpit · Mar 2, 2025 · Feb 26, 2025 · Feb 26, 2025 · Feb 26, 2025
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -3,8 +3,7 @@ name: Build
 on:
   push:
     tags:
-      - 'v*.*.*'
-
+      - "v*.*.*"
 
 jobs:
   publish-to-pypi:
@@ -18,7 +17,7 @@ jobs:
         id: setup-python
         uses: actions/setup-python@v5
         with:
-          python-version: '3.9'
+          python-version: "3.11"
 
       - name: Install Poetry
         uses: snok/install-poetry@v1

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -40,7 +40,7 @@ jobs:
         id: setup-python
         uses: actions/setup-python@v5
         with:
-          python-version: '3.9'
+          python-version: "3.11"
 
       - name: Install Poetry
         uses: snok/install-poetry@v1
@@ -78,7 +78,7 @@ jobs:
         id: setup-python
         uses: actions/setup-python@v5
         with:
-          python-version: '3.9'
+          python-version: "3.11"
 
       - name: Install Poetry
         uses: snok/install-poetry@v1
@@ -133,7 +133,7 @@ jobs:
         id: setup-python
         uses: actions/setup-python@v5
         with:
-          python-version: '3.9'
+          python-version: "3.11"
 
       - name: Install Poetry
         uses: snok/install-poetry@v1
@@ -171,7 +171,7 @@ jobs:
           --base_path=tests/cicd/generate_docstring \
           --disable_telemetry
 
-      - name : Generate Diagram
+      - name: Generate Diagram
         run: |
           source .venv/bin/activate
           patchwork GenerateDiagram --log debug \
@@ -188,7 +188,7 @@ jobs:
           --github_api_key=${{ secrets.SCM_GITHUB_KEY }} \
           --folder_path=tests/cicd/generate_docstring \
           --disable_telemetry
-      
+
       - name: Generate Code Usage Example
         run: |
           source .venv/bin/activate
@@ -204,15 +204,15 @@ jobs:
           # Specify the parent folder you want to check
           PARENT_FOLDER="./patchwork/steps"
           # Command to run if README.md is not found
-          
+
           find "$PARENT_FOLDER" -mindepth 1 -maxdepth 1 -type d | grep -vE '/\.\.?/' | grep -vE '/__' | while read -r dir; do
             if [[ ! -f "$dir/README.md" ]]; then
               echo "No README.md in $dir"
               # Extract the last part of the path to use as a base for the branch name
               base_name=$(basename "$dir")
               # Convert to a Git-friendly branch name: replace spaces with underscores, remove slashes, etc.
               branch_name=$(echo "$base_name" | sed -e 's/[^a-zA-Z0-9]/_/g' -e 's/__*/_/g' -e 's/^_//g' -e 's/_$//g')
-          
+
               patchwork GenerateREADME --log debug \
               --patched_api_key=${{ secrets.PATCHED_API_KEY }} \
               --github_api_key=${{ secrets.SCM_GITHUB_KEY }} \

diff --git a/patchwork/common/client/llm/google.py b/patchwork/common/client/llm/google.py
@@ -111,7 +111,7 @@ def is_model_supported(self, model: str) -> bool:
         return model in self.get_models()
 
     def __upload(self, file: Path | NotGiven) -> Part | File | None:
-        if file is NotGiven:
+        if isinstance(file, NotGiven):
             return None
 
         file_bytes = file.read_bytes()

diff --git a/patchwork/steps/BrowserUse/BrowserUse.py b/patchwork/steps/BrowserUse/BrowserUse.py
@@ -0,0 +1,239 @@
+import asyncio
+import logging
+import os
+from datetime import datetime
+
+from patchwork.step import Step
+from patchwork.steps import SimplifiedLLMOnce
+from patchwork.steps.BrowserUse.typed import BrowserUseInputs, BrowserUseOutputs
+
+logger = logging.getLogger(__name__)
+
+# Module-level cache for the browser and controller created by init_browser()
+_browser = None
+_controller = None
+
+
+def init_browser():
+    """
+    Build the shared browser and controller on first use and cache them.
+
+    Later calls return the pair cached in ``_browser``/``_controller``.
+
+    Returns:
+        tuple: (Browser, Controller) instances for web automation
+    """
+    global _browser, _controller
+
+    # Return cached instances if already initialized
+    if _browser is not None and _controller is not None:
+        return _browser, _controller
+
+    from browser_use import Browser, BrowserConfig, BrowserContextConfig, Controller
+    from browser_use.agent.views import ActionResult
+    from browser_use.browser.context import BrowserContext
+
+    # Create ./downloads if needed; exist_ok avoids a check-then-create race
+    downloads_path = os.path.join(os.getcwd(), "downloads")
+    os.makedirs(downloads_path, exist_ok=True)
+
+    context_config = BrowserContextConfig(save_downloads_path=downloads_path)
+    config = BrowserConfig(
+        headless=True, disable_security=True, new_context_config=context_config
+    )
+    controller = Controller()
+
+    # Register custom action to upload files to web elements
+    @controller.action(description="Upload file to interactive element with file path")
+    async def upload_file(index: int, path: str, browser: BrowserContext):
+        """
+        Upload a file to a file input element identified by its index.
+
+        Args:
+            index: The DOM element index to target
+            path: Local file path to upload
+            browser: Browser context for interaction
+
+        Returns:
+            ActionResult: Result of the upload operation
+        """
+        if not os.path.exists(path):
+            return ActionResult(error=f"File {path} does not exist")
+
+        # Both element lookups below report a miss with this same message
+        not_found = f"No file upload element found at index {index}. The element may be hidden or not an input type file"
+        dom_el = await browser.get_dom_element_by_index(index)
+        file_upload_dom_el = dom_el.get_file_upload_element()
+        if file_upload_dom_el is None:
+            logger.info(not_found)
+            return ActionResult(error=not_found)
+
+        file_upload_el = await browser.get_locate_element(file_upload_dom_el)
+        if file_upload_el is None:
+            logger.info(not_found)
+            return ActionResult(error=not_found)
+
+        try:
+            await file_upload_el.set_input_files(path)
+            msg = f"Successfully uploaded file to index {index}"
+            logger.info(msg)
+            return ActionResult(extracted_content=msg, include_in_memory=True)
+        except Exception as e:
+            msg = f"Failed to upload file to index {index}: {str(e)}"
+            logger.info(msg)
+            return ActionResult(error=msg)
+
+    # Register custom action to read file contents
+    @controller.action(description="Read the file content of a file given a path")
+    async def read_file(path: str):
+        """
+        Read and return the contents of a file at the specified path.
+
+        Args:
+            path: Path to the file to read
+
+        Returns:
+            ActionResult: File contents or error message
+        """
+        if not os.path.exists(path):
+            return ActionResult(error=f"File {path} does not exist")
+
+        # Report unreadable paths as an action error, as upload_file does
+        try:
+            with open(path, "r") as f:
+                content = f.read()
+        except (OSError, UnicodeDecodeError) as e:
+            msg = f"Failed to read file {path}: {str(e)}"
+            logger.info(msg)
+            return ActionResult(error=msg)
+        msg = f"File content: {content}"
+        logger.info(msg)
+        return ActionResult(extracted_content=msg, include_in_memory=True)
+
+    # Cache the initialized instances
+    _browser = Browser(config=config)
+    _controller = controller
+
+    return _browser, _controller
+
+
+class BrowserUse(Step, input_class=BrowserUseInputs, output_class=BrowserUseOutputs):
+    """
+    Step implementation for browser automation tasks.
+
+    Runs a browser_use ``Agent`` on the ``task`` input, driven by a Google,
+    OpenAI or Anthropic chat model picked from whichever API key is supplied.
+    """
+    required_keys = {"task"}
+
+    def __init__(self, inputs):
+        """
+        Initialize the BrowserUse step with configuration inputs.
+
+        Args:
+            inputs: Dictionary containing configuration parameters (see: BrowserUseInputs)
+
+        Raises:
+            ValueError: If ``task`` is missing or no supported API key is given
+        """
+        super().__init__(inputs)
+
+        if not all(key in inputs.keys() for key in self.required_keys):
+            raise ValueError(f'Missing required data: "{self.required_keys}"')
+
+        # Configure the appropriate LLM based on provided API keys
+        if "google_api_key" in self.inputs:
+            from langchain_google_genai import ChatGoogleGenerativeAI
+
+            self.llm = ChatGoogleGenerativeAI(
+                model="gemini-2.0-flash", google_api_key=self.inputs["google_api_key"]
+            )
+        elif "openai_api_key" in self.inputs:
+            from langchain_openai import ChatOpenAI
+
+            self.llm = ChatOpenAI(model="gpt-4o", api_key=self.inputs["openai_api_key"])
+        elif "anthropic_api_key" in self.inputs:
+            from langchain_anthropic import ChatAnthropic
+
+            self.llm = ChatAnthropic(
+                model="claude-3-7-sonnet-latest",
+                api_key=self.inputs["anthropic_api_key"],
+            )
+        else:
+            raise ValueError("No LLM API key provided (google, openai or anthropic)")
+
+        # Timestamped GIF file name when generate_gif or debug is truthy, else False
+        self.generate_gif = (
+            f"agent_history_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.gif"
+            if ("generate_gif" in self.inputs and self.inputs["generate_gif"])
+            or ("debug" in self.inputs and self.inputs["debug"])
+            else False
+        )
+
+    def run(self) -> dict:
+        """
+        Execute the browser automation task.
+
+        Returns:
+            dict: history/result/generated_gif, or the JSON-formatted result
+                when ``example_json`` is provided
+        """
+        from browser_use import Agent
+
+        browser, controller = init_browser()
+        agent = Agent(
+            browser=browser,
+            controller=controller,
+            task=self.inputs["task"],
+            llm=self.llm,
+            generate_gif=self.generate_gif,
+            validate_output=True,
+        )
+
+        # Run on a fresh event loop. NOTE(review): it is never closed - confirm
+        loop = asyncio.new_event_loop()
+        self.history = loop.run_until_complete(agent.run())
+
+        # Format results as JSON if schema provided
+        if "example_json" in self.inputs:
+            return self.__format_history_as_json()
+
+        return {
+            "history": self.history,
+            "result": self.history.final_result(),
+            "generated_gif": self.generate_gif,
+        }
+
+    def __format_history_as_json(self):
+        """
+        Convert the agent's final result to JSON via SimplifiedLLMOnce.
+
+        Returns:
+            dict: Output of ``SimplifiedLLMOnce.run()`` for the ``example_json``
+                schema, using the same provider key as the main task
+        """
+        inputs = dict(
+            user_prompt=f"""
+You are a helpful assistant that formats a history of browser actions and conversations into a JSON object.
+You are provided with a JSON schema for the history.
+Only include the JSON object in your response, nothing else.
+
+Here is the history:
+<history>
+{self.history.final_result()}
+</history>
+""",
+            json_schema=self.inputs["example_json"],
+            prompt_value=dict(),
+        )
+
+        if "google_api_key" in self.inputs:
+            inputs["google_api_key"] = self.inputs["google_api_key"]
+            inputs["model"] = "gemini-2.0-flash"
+        elif "openai_api_key" in self.inputs:
+            inputs["openai_api_key"] = self.inputs["openai_api_key"]
+            inputs["model"] = "gpt-4o-mini"
+        elif "anthropic_api_key" in self.inputs:
+            inputs["anthropic_api_key"] = self.inputs["anthropic_api_key"]
+            inputs["model"] = "claude-3-5-haiku-latest"
+        return SimplifiedLLMOnce(inputs).run()