Skip to content
This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Switch usage of re package for regex which is slightly more performant #1127

Merged
merged 1 commit into from
Feb 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ onnxruntime = "==1.20.1"
onnx = "==1.17.0"
spacy = "<3.8.0"
en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"}
regex = "==2024.11.6"

[tool.poetry.group.dev.dependencies]
pytest = "==8.3.4"
Expand Down
2 changes: 1 addition & 1 deletion src/codegate/api/v1_processing.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import asyncio
import json
import re
from collections import defaultdict
from typing import AsyncGenerator, Dict, List, Optional, Tuple

import cachetools.func
import regex as re
import requests
import structlog

Expand Down
2 changes: 1 addition & 1 deletion src/codegate/clients/detector.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import re
from abc import ABC, abstractmethod
from functools import wraps
from typing import List, Optional

import regex as re
import structlog
from fastapi import Request

Expand Down
11 changes: 8 additions & 3 deletions src/codegate/db/fim_cache.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import datetime
import hashlib
import json
import re
from typing import Dict, List, Optional

import regex as re
import structlog
from pydantic import BaseModel

Expand All @@ -21,6 +21,11 @@ class CachedFim(BaseModel):
initial_id: str


# Regular expression to match file paths in FIM messages.
# Compiled regex to improve performance.
filepath_matcher = re.compile(r"^(#|//|<!--|--|%|;).*?\b([a-zA-Z0-9_\-\/]+\.\w+)\b", re.MULTILINE)


class FimCache:

def __init__(self):
Expand Down Expand Up @@ -55,8 +60,8 @@ def _match_filepath(self, message: str, provider: str) -> Optional[str]:
# folder/testing_file.py
# Path: file3.py
# // Path: file3.js <-- Javascript
pattern = r"^(#|//|<!--|--|%|;).*?\b([a-zA-Z0-9_\-\/]+\.\w+)\b"
matches = re.findall(pattern, message, re.MULTILINE)
matches = filepath_matcher.findall(message)

# If no path is found, hash the entire prompt message.
if not matches:
return None
Expand Down
2 changes: 1 addition & 1 deletion src/codegate/extract_snippets/message_extractor.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import re
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, List, Optional, Self

import regex as re
import structlog
from pydantic import BaseModel, field_validator, model_validator
from pygments.lexers import guess_lexer
Expand Down
2 changes: 1 addition & 1 deletion src/codegate/pipeline/cli/cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import re
import shlex
from typing import Optional

import regex as re
from litellm import ChatCompletionRequest

from codegate.clients.clients import ClientType
Expand Down
16 changes: 10 additions & 6 deletions src/codegate/pipeline/codegate_context_retriever/codegate.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json
import re

import regex as re
import structlog
from litellm import ChatCompletionRequest

Expand All @@ -19,6 +19,12 @@
logger = structlog.get_logger("codegate")


# Pre-compiled regex patterns for performance
markdown_code_block = re.compile(r"```.*?```", flags=re.DOTALL)
markdown_file_listing = re.compile(r"⋮...*?⋮...\n\n", flags=re.DOTALL)
environment_details = re.compile(r"<environment_details>.*?</environment_details>", flags=re.DOTALL)


class CodegateContextRetriever(PipelineStep):
"""
Pipeline step that adds a context message to the completion request when it detects
Expand Down Expand Up @@ -95,11 +101,9 @@ async def process( # noqa: C901

# Remove code snippets and file listings from the user messages and search for bad packages
# in the rest of the user query/messages
user_messages = re.sub(r"```.*?```", "", user_message, flags=re.DOTALL)
user_messages = re.sub(r"⋮...*?⋮...\n\n", "", user_messages, flags=re.DOTALL)
user_messages = re.sub(
r"<environment_details>.*?</environment_details>", "", user_messages, flags=re.DOTALL
)
user_messages = markdown_code_block.sub("", user_message)
user_messages = markdown_file_listing.sub("", user_messages)
user_messages = environment_details.sub("", user_messages)

# Split the messages on <task> tags and newlines (real or escaped) to avoid passing too much content to the search
split_messages = re.split(r"</?task>|\n|\\n", user_messages)
Expand Down
2 changes: 1 addition & 1 deletion src/codegate/pipeline/pii/pii.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
from typing import Any, Dict, List, Optional

import regex as re
import structlog
from litellm import ChatCompletionRequest, ChatCompletionSystemMessage, ModelResponse
from litellm.types.utils import Delta, StreamingChoices
Expand Down
2 changes: 1 addition & 1 deletion src/codegate/pipeline/secrets/secrets.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import re
from abc import abstractmethod
from typing import List, Optional, Tuple

import regex as re
import structlog
from litellm import ChatCompletionRequest, ChatCompletionSystemMessage, ModelResponse
from litellm.types.utils import Delta, StreamingChoices
Expand Down
2 changes: 1 addition & 1 deletion src/codegate/pipeline/secrets/signatures.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# signatures.py
import math
import re
from pathlib import Path
from threading import Lock
from typing import ClassVar, Dict, List, NamedTuple, Optional, Union

import regex as re
import structlog
import yaml

Expand Down
7 changes: 5 additions & 2 deletions src/codegate/providers/copilot/provider.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import asyncio
import datetime
import os
import re
import ssl
import tempfile
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union
from urllib.parse import unquote, urljoin, urlparse

import regex as re
import structlog
from litellm.types.utils import Delta, ModelResponse, StreamingChoices

Expand All @@ -29,6 +29,9 @@
setup_logging()
logger = structlog.get_logger("codegate").bind(origin="copilot_proxy")

# Pre-compiled regex patterns for performance
proxy_ep_pattern = re.compile(r"proxy-ep=([^;]+)")


TEMPDIR = None
if os.getenv("CODEGATE_DUMP_DIR"):
Expand Down Expand Up @@ -613,7 +616,7 @@ async def _get_target_url(self, complete_request) -> Optional[str]:
auth_header = headers_dict.get("authorization", "")

if auth_header:
match = re.search(r"proxy-ep=([^;]+)", auth_header)
match = proxy_ep_pattern.search(auth_header)
if match:
self.proxy_ep = match.group(1)
if not urlparse(self.proxy_ep).scheme:
Expand Down
11 changes: 8 additions & 3 deletions src/codegate/storage/storage_engine.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import os
import re
import sqlite3
from typing import List, Optional

import numpy as np
import regex as re
import sqlite_vec_sl_tmp
import structlog

Expand All @@ -21,6 +21,11 @@
}


# Pre-compiled regex patterns for performance
alpha_numeric_pattern = re.compile(r"[^\w\s]*$")
non_alphanumeric_pattern = re.compile(r"[^\w@\/\.-]")


class StorageEngine:
__storage_engine = None

Expand Down Expand Up @@ -231,11 +236,11 @@ async def search(
query_words = None
if query:
# Strip trailing characters that are neither word characters nor whitespace
cleaned_query = re.sub(r"[^\w\s]*$", "", query.lower())
cleaned_query = alpha_numeric_pattern.sub("", query.lower())

# Replace every remaining character that is not a word character
# or one of @, /, . and - with a space
cleaned_query = re.sub(r"[^\w@\/\.-]", " ", cleaned_query)
cleaned_query = non_alphanumeric_pattern.sub(" ", cleaned_query)

# Tokenize the cleaned query
query_words = cleaned_query.split()
Expand Down
Loading