Skip to content
This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Code snippet extraction pipeline step #130

Merged
merged 1 commit into from
Nov 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions src/codegate/pipeline/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,13 @@ class CodeSnippet:
code: The actual code content
"""

language: str
language: Optional[str]
filepath: Optional[str]
code: str

def __post_init__(self):
if not self.language or not self.language.strip():
raise ValueError("Language must not be empty")
if not self.code or not self.code.strip():
raise ValueError("Code must not be empty")
self.language = self.language.strip().lower()
if self.language is not None:
self.language = self.language.strip().lower()


@dataclass
Expand Down Expand Up @@ -57,6 +55,7 @@ class PipelineResult:

request: Optional[ChatCompletionRequest] = None
response: Optional[PipelineResponse] = None
context: Optional[PipelineContext] = None
error_message: Optional[str] = None

def shortcuts_processing(self) -> bool:
Expand Down Expand Up @@ -165,4 +164,7 @@ async def process_request(
if result.request is not None:
current_request = result.request

return PipelineResult(request=current_request)
if result.context is not None:
context = result.context

return PipelineResult(request=current_request, context=context)
Empty file.
131 changes: 131 additions & 0 deletions src/codegate/pipeline/extract_snippets/extract_snippets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import os
import re
from typing import List, Optional

import structlog
from litellm.types.llms.openai import ChatCompletionRequest

from codegate.pipeline.base import CodeSnippet, PipelineContext, PipelineResult, PipelineStep

# Matches one fenced code block of the form
#
#     ```[language ][filename][ (lineinfo)]
#     content
#     ```
#
# The header (everything between the opening backticks and the first newline)
# must stay on a single line: only spaces/tabs are accepted as separators, so
# the optional header groups can never consume the first line of the code
# itself. With ``\s`` as separator, a block such as "```python\nfoo(1)\n..."
# had "foo" captured as the filename and "1" as the line info.
CODE_BLOCK_PATTERN = re.compile(
    r"```"
    # Optional language tag; it only counts as a language when another
    # header token follows it on the same line.
    r"(?:(?P<language>\w+)[ \t]+)?"
    # Optional filename: no whitespace and no opening parenthesis.
    r"(?P<filename>[^\s\(]+)?"
    # Optional "(lineFrom-lineTo)"-style info, confined to the header line.
    r"(?:[ \t]*\((?P<lineinfo>[^)\n]+)\))?"
    # Newline terminating the header.
    r"\n"
    # Block body, non-greedy up to the closing fence (DOTALL lets "." span
    # newlines, replacing the slower "(?:.|\n)" alternation).
    r"(?P<content>.*?)"
    r"```",
    re.DOTALL,
)

# Module-level structlog logger obtained under the name "codegate".
logger = structlog.get_logger("codegate")

def ecosystem_from_filepath(filepath: str) -> Optional[str]:
    """
    Map a file path to a language name based on its extension.

    Args:
        filepath: Path (or bare file name) to inspect

    Returns:
        The language associated with the file extension, or None when the
        path has no extension or the extension is not in the table
    """
    # Only the extension matters; compare it case-insensitively.
    _, suffix = os.path.splitext(filepath)
    suffix = suffix.lower()

    languages_by_suffix = {
        ".py": "python",
        ".js": "javascript",
        ".ts": "typescript",
        ".tsx": "typescript",
        ".go": "go",
        ".rs": "rust",
        ".java": "java",
    }
    if suffix in languages_by_suffix:
        return languages_by_suffix[suffix]
    return None


def ecosystem_from_message(message: str) -> Optional[str]:
    """
    Determine language from the language tag of a code fence.

    Args:
        message: The language tag taken from the message. Some extensions
                 send a format where the language is present in the snippet
                 header, e.g. "py /path/to/file (lineFrom-lineTo)"; here
                 only the tag itself ("py") is expected.

    Returns:
        The normalized language name, or None when the tag is empty or not
        recognised
    """
    if not message:
        return None

    # Short tags plus the full language names. The set of languages mirrors
    # the extensions known to ecosystem_from_filepath, so both lookups agree
    # on which ecosystems are supported.
    language_mapping = {
        "py": "python",
        "python": "python",
        "js": "javascript",
        "javascript": "javascript",
        "ts": "typescript",
        "tsx": "typescript",
        "typescript": "typescript",
        "go": "go",
        "rs": "rust",
        "rust": "rust",
        "java": "java",
    }
    # Tags are matched case-insensitively and ignoring surrounding whitespace.
    return language_mapping.get(message.strip().lower())


def extract_snippets(message: str) -> List[CodeSnippet]:
    """
    Extract code snippets from a message.

    Every fenced block matched by CODE_BLOCK_PATTERN becomes one CodeSnippet.
    The language is taken from the fence's language tag when it is
    recognised, otherwise from the extension of the file name in the header.

    Args:
        message: Input text containing code snippets

    Returns:
        List of extracted code snippets, in the order they appear
    """
    snippets: List[CodeSnippet] = []

    for match in CODE_BLOCK_PATTERN.finditer(message):
        filename = match.group("filename")
        content = match.group("content")
        matched_language = match.group("language")

        lang = None
        if matched_language:
            lang = ecosystem_from_message(matched_language.strip())
        elif filename:
            # A header with a single token (e.g. "```py") is captured in the
            # filename group, because the language group needs a second
            # token after it. When that lone token is a recognised language
            # tag, record it as the language instead of as a file path.
            tag_language = ecosystem_from_message(filename)
            if tag_language is not None:
                lang = tag_language
                filename = None

        if lang is None and filename:
            # Fall back to the file extension.
            lang = ecosystem_from_filepath(filename)

        snippets.append(CodeSnippet(filepath=filename, code=content, language=lang))

    return snippets


class CodeSnippetExtractor(PipelineStep):
    """
    Pipeline step whose only job is to pull code snippets out of the last
    user message and record them on the pipeline context.
    """

    def __init__(self):
        """Set up the step by delegating to the base class initializer."""
        super().__init__()

    @property
    def name(self) -> str:
        """Identifier of this pipeline step."""
        return "code-snippet-extractor"

    async def process(
        self,
        request: ChatCompletionRequest,
        context: PipelineContext,
    ) -> PipelineResult:
        """
        Extract snippets from the last user message into the context.

        Args:
            request: The incoming chat completion request
            context: Pipeline context that receives the extracted snippets

        Returns:
            A PipelineResult carrying the context; when there is no user
            message the unchanged request is returned alongside it
        """
        latest = self.get_last_user_message(request)
        if not latest:
            # Nothing to scan: hand request and context back untouched.
            return PipelineResult(request=request, context=context)

        # Only the first element (the message text) is used here.
        user_text, _ = latest
        snippets = extract_snippets(user_text)

        logger.info(f"Extracted {len(snippets)} code snippets from the user message")

        for snippet in snippets:
            logger.debug(f"Code snippet: {snippet}")
            context.add_code_snippet(snippet)

        return PipelineResult(context=context)
2 changes: 2 additions & 0 deletions src/codegate/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from codegate.config import Config
from codegate.pipeline.base import PipelineStep, SequentialPipelineProcessor
from codegate.pipeline.codegate_system_prompt.codegate import CodegateSystemPrompt
from codegate.pipeline.extract_snippets.extract_snippets import CodeSnippetExtractor
from codegate.pipeline.version.version import CodegateVersion
from codegate.providers.anthropic.provider import AnthropicProvider
from codegate.providers.llamacpp.provider import LlamaCppProvider
Expand All @@ -23,6 +24,7 @@ def init_app() -> FastAPI:

steps: List[PipelineStep] = [
CodegateVersion(),
CodeSnippetExtractor(),
CodegateSystemPrompt(Config.get_config().prompts.codegate_chat),
# CodegateSecrets(),
]
Expand Down
Loading