99 changes: 99 additions & 0 deletions .github/workflows/codeql.yml
@@ -0,0 +1,99 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL Advanced"

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]
  schedule:
    - cron: '44 23 * * 6'

jobs:
  analyze:
    name: Analyze (${{ matrix.language }})
    # Runner size impacts CodeQL analysis time. To learn more, please see:
    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
    #   - https://gh.io/supported-runners-and-hardware-resources
    #   - https://gh.io/using-larger-runners (GitHub.com only)
    # Consider using larger runners or machines with greater resources for possible analysis time improvements.
    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
    permissions:
      # required for all workflows
      security-events: write

      # required to fetch internal or private CodeQL packs
      packages: read

      # only required for workflows in private repositories
      actions: read
      contents: read

    strategy:
      fail-fast: false
      matrix:
        include:
        - language: python
          build-mode: none
        # CodeQL supports the following values for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'rust', 'swift'
        # Use 'c-cpp' to analyze code written in C, C++ or both
        # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
        # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
        # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
        # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
        # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
        # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    # Add any setup steps before running the `github/codeql-action/init` action.
    # This includes steps like installing compilers or runtimes (`actions/setup-node`
    # or others). This is typically only required for manual builds.
    # - name: Setup runtime (example)
    #   uses: actions/setup-example@v1

    # Initializes the CodeQL tools for scanning.
    - name: Initialize CodeQL
      uses: github/codeql-action/init@v4
      with:
        languages: ${{ matrix.language }}
        build-mode: ${{ matrix.build-mode }}
        # If you wish to specify custom queries, you can do so here or in a config file.
        # By default, queries listed here will override any specified in a config file.
        # Prefix the list here with "+" to use these queries and those in the config file.

        # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
        # queries: security-extended,security-and-quality

    # If the analyze step fails for one of the languages you are analyzing with
    # "We were unable to automatically build your code", modify the matrix above
    # to set the build mode to "manual" for that language. Then modify this step
    # to build your code.
    # ℹ️ Command-line programs to run using the OS shell.
    # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
    - name: Run manual build steps
      if: matrix.build-mode == 'manual'
      shell: bash
      run: |
        echo 'If you are using a "manual" build mode for one or more of the' \
          'languages you are analyzing, replace this with the commands to build' \
          'your code, for example:'
        echo '  make bootstrap'
        echo '  make release'
        exit 1

    - name: Perform CodeQL Analysis
      uses: github/codeql-action/analyze@v4
      with:
        category: "/language:${{matrix.language}}"
222 changes: 222 additions & 0 deletions cortex/nl_parser.py
@@ -0,0 +1,222 @@
import difflib
import re
from difflib import SequenceMatcher
from typing import Dict, Any, List, Tuple

# Vocabulary for typo correction
VOCAB = {
"python", "pip", "venv", "virtualenv", "conda", "anaconda",
"docker", "kubernetes", "k8s", "kubectl",
"nginx", "apache", "httpd", "web", "server",
"flask", "django", "tensorflow", "pytorch", "torch",
"install", "setup", "development", "env", "environment",
}

# Canonical examples for lightweight semantic matching
INTENT_EXAMPLES = {
"install_ml": [
"install something for machine learning",
"install pytorch",
"install tensorflow",
"i want to run pytorch",
],
"install_web_server": [
"i need a web server",
"install nginx",
"install apache",
"set up a web server",
],
"setup_python_env": [
"set up python development environment",
"install python 3.10",
"create python venv",
"setup dev env",
],
"install_docker": [
"install docker",
"add docker",
"deploy containers - docker",
],
"install_docker_k8s": [
"install docker and kubernetes",
"docker and k8s",
"k8s and docker on my mac",
],
}


def normalize(text: str) -> str:
text = text.lower()
text = text.replace("-", " ")
text = re.sub(r"[^a-z0-9.\s]", " ", text)
text = re.sub(r"\s+", " ", text).strip()
return text


def tokenize(text: str) -> List[str]:
return text.split()
Comment on lines +56 to +57
🛠️ Refactor suggestion | 🟠 Major

Add docstring to public function.

Per coding guidelines, docstrings are required for all public APIs.

Apply this diff:

 def tokenize(text: str) -> List[str]:
+    """Split text into tokens by whitespace."""
     return text.split()



def spell_correct_token(token: str) -> Tuple[str, bool]:
    """Return corrected_token, was_corrected"""
    if token in VOCAB:
        return token, False
    close = difflib.get_close_matches(token, VOCAB, n=1, cutoff=0.75)
    if close:
        return close[0], True
    return token, False


def apply_spell_correction(tokens: List[str]) -> Tuple[List[str], List[Tuple[str, str]]]:
    corrections = []
    new_tokens = []
    for t in tokens:
        new, fixed = spell_correct_token(t)
        if fixed:
            corrections.append((t, new))
        new_tokens.append(new)
    return new_tokens, corrections
Comment on lines +70 to +78
🛠️ Refactor suggestion | 🟠 Major

Add docstring to public function.

Per coding guidelines, docstrings are required for all public APIs.

Apply this diff:

 def apply_spell_correction(tokens: List[str]) -> Tuple[List[str], List[Tuple[str, str]]]:
+    """Apply spell correction to all tokens.
+    
+    Returns:
+        Tuple of (corrected_tokens, corrections) where corrections is a list of (original, corrected) pairs.
+    """
     corrections = []



def fuzzy_phrase_score(a: str, b: str) -> float:
    return SequenceMatcher(None, a, b).ratio()
Comment on lines +81 to +82
🛠️ Refactor suggestion | 🟠 Major

Add docstring to public function.

Per coding guidelines, docstrings are required for all public APIs.

Apply this diff:

 def fuzzy_phrase_score(a: str, b: str) -> float:
+    """Compute similarity ratio between two strings (0.0 to 1.0)."""
     return SequenceMatcher(None, a, b).ratio()



def semantic_intent_score(text: str) -> Tuple[str, float]:
    """Compare text with intent examples."""
    best_intent = "unknown"
    best_score = 0.0

    for intent, examples in INTENT_EXAMPLES.items():
        for ex in examples:
            score = fuzzy_phrase_score(text, ex)
            if score > best_score:
                best_score = score
                best_intent = intent

    return best_intent, best_score


def rule_intent(text: str) -> Tuple[str, float]:
    """Simple keyword/rule-based detection."""
    t = text

    if "docker" in t:
        if "kubernetes" in t or "k8s" in t or "kubectl" in t:
            return "install_docker_k8s", 0.95
        return "install_docker", 0.9

    if "kubernetes" in t or "k8s" in t or "kubectl" in t:
        return "install_docker_k8s", 0.9
Comment on lines +109 to +110
⚠️ Potential issue | 🟠 Major

Add test coverage and clarify kubernetes-only intent handling.

Lines 109–110 return install_docker_k8s when the user mentions only kubernetes/k8s keywords (without docker). However:

  • The INTENT_EXAMPLES for install_docker_k8s (lines 40–44) only show examples with both docker AND kubernetes.
  • No test case covers kubernetes-only input (e.g., "install kubernetes").
  • No separate install_kubernetes intent exists.

Either: (1) add an install_kubernetes intent with corresponding examples and test cases, or (2) if kubernetes always requires docker in the cortex install flow, add a test case for kubernetes-only input and a comment explaining this design choice.
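For illustration only, a minimal sketch of option (1); the `install_kubernetes` intent name, its examples, the rule branch, and the test below are hypothetical and not part of this PR:

```python
from cortex.nl_parser import INTENT_EXAMPLES, parse_request

# Hypothetical kubernetes-only intent examples (names are illustrative).
INTENT_EXAMPLES["install_kubernetes"] = [
    "install kubernetes",
    "set up k8s",
    "i need kubectl on ubuntu",
]

# In rule_intent(), a kubernetes-only branch could run before the combined check:
#     if ("kubernetes" in t or "k8s" in t or "kubectl" in t) and "docker" not in t:
#         return "install_kubernetes", 0.9


def test_kubernetes_only_intent():
    # Hypothetical test; the expected intent depends on which option is adopted.
    result = parse_request("install kubernetes")
    assert result["intent"] in ("install_kubernetes", "install_docker_k8s")
```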


if "nginx" in t or "apache" in t or "httpd" in t or "web server" in t:
return "install_web_server", 0.9

if "python" in t or "venv" in t or "conda" in t or "anaconda" in t:
return "setup_python_env", 0.9

if any(word in t for word in ("tensorflow", "pytorch", "torch", "machine learning", "ml")):
return "install_ml", 0.9

return "unknown", 0.0


VERSION_RE = re.compile(r"python\s*([0-9]+(?:\.[0-9]+)?)")

Check warning on line 124 in cortex/nl_parser.py

SonarQubeCloud / SonarCloud Code Analysis

Use concise character class syntax '\d' instead of '[0-9]'. (Reported once for each of the two '[0-9]' occurrences in the pattern.)

See more on https://sonarcloud.io/project/issues?id=cortexlinux_cortex&issues=AZsOK8Kz3zSehC9xcj3b&open=AZsOK8Kz3zSehC9xcj3b&pullRequest=293 and https://sonarcloud.io/project/issues?id=cortexlinux_cortex&issues=AZsOK8Kz3zSehC9xcj3a&open=AZsOK8Kz3zSehC9xcj3a&pullRequest=293
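A minimal sketch of the change the warning asks for, assuming the capture group and optional minor version stay the same:

```python
import re

# Same behaviour as VERSION_RE above, using the \d shorthand SonarCloud suggests.
VERSION_RE = re.compile(r"python\s*(\d+(?:\.\d+)?)")
```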
PLATFORM_RE = re.compile(r"\b(mac|macos|windows|linux|ubuntu|debian)\b")
PACKAGE_RE = re.compile(r"\b(nginx|apache|docker|kubernetes|k8s|kubectl|python|pip|venv|conda|tensorflow|pytorch)\b")
Comment on lines +124 to +126
🛠️ Refactor suggestion | 🟠 Major

Add type annotations for regex constants.

As per coding guidelines, type hints are required for module-level constants.

Apply this diff:

+import re
+from typing import Pattern
+
-VERSION_RE = re.compile(r"python\s*([0-9]+(?:\.[0-9]+)?)")
-PLATFORM_RE = re.compile(r"\b(mac|macos|windows|linux|ubuntu|debian)\b")
-PACKAGE_RE = re.compile(r"\b(nginx|apache|docker|kubernetes|k8s|kubectl|python|pip|venv|conda|tensorflow|pytorch)\b")
+VERSION_RE: Pattern[str] = re.compile(r"python\s*([0-9]+(?:\.[0-9]+)?)")
+PLATFORM_RE: Pattern[str] = re.compile(r"\b(mac|macos|windows|linux|ubuntu|debian)\b")
+PACKAGE_RE: Pattern[str] = re.compile(r"\b(nginx|apache|docker|kubernetes|k8s|kubectl|python|pip|venv|conda|tensorflow|pytorch)\b")



def extract_slots(text: str) -> Dict[str, Any]:
    slots = {}

    v = VERSION_RE.search(text)
    if v:
        slots["python_version"] = v.group(1)

    p = PLATFORM_RE.search(text)
    if p:
        slots["platform"] = p.group(1)

    pkgs = PACKAGE_RE.findall(text)
    if pkgs:
        slots["packages"] = list(dict.fromkeys(pkgs))  # unique preserve order

    return slots


def aggregate_confidence(c_rule, c_sem, num_corrections, c_classifier=0.0):
    penalty = 1 - (num_corrections * 0.1)
    penalty = max(0.0, penalty)

    final = (
        0.4 * c_rule +
        0.4 * c_sem +
        0.2 * c_classifier
    ) * penalty

    return round(max(0.0, min(1.0, final)), 2)


def decide_clarifications(intent, confidence):
    if intent == "unknown" or confidence < 0.6:
        return [
            "Install Docker and Kubernetes",
            "Set up Python development environment",
            "Install a web server (nginx/apache)",
            "Install ML libraries (tensorflow/pytorch)",
        ]
    if intent == "setup_python_env" and confidence < 0.75:
        return ["Use venv", "Use conda", "Install a specific Python version"]
    return []


def parse_request(text: str) -> Dict[str, Any]:
    """Main function used by tests and demo."""
    norm = normalize(text)
    tokens = tokenize(norm)

    tokens_corr, corrections = apply_spell_correction(tokens)
    corrected_text = " ".join(tokens_corr)

    rule_int, c_rule = rule_intent(corrected_text)
    sem_int, c_sem = semantic_intent_score(corrected_text)

    if rule_int != "unknown" and rule_int == sem_int:
        chosen_intent = rule_int
        c_classifier = 0.95
    elif rule_int != "unknown":
        chosen_intent = rule_int
        c_classifier = 0.0
    elif c_sem > 0.6:
        chosen_intent = sem_int
        c_classifier = 0.0
    else:
        chosen_intent = "unknown"
        c_classifier = 0.0

    slots = extract_slots(corrected_text)

    confidence = aggregate_confidence(
        c_rule, c_sem, len(corrections), c_classifier
    )

    clarifications = decide_clarifications(chosen_intent, confidence)

    explanation = []
    if corrections:
        explanation.append(
            "corrected: " + ", ".join(f"{a}->{b}" for a, b in corrections)
        )
    explanation.append(f"rule_intent={rule_int} ({c_rule:.2f})")
    explanation.append(f"semantic_match={sem_int} ({c_sem:.2f})")

    return {
        "intent": chosen_intent,
        "confidence": confidence,
        "explanation": "; ".join(explanation),
        "slots": slots,
        "corrections": corrections,
        "clarifications": clarifications,
    }


32 changes: 32 additions & 0 deletions docs/NL_PARSER.md
@@ -0,0 +1,32 @@
# Natural Language Parser (NL Parser)

## Overview
The NL Parser enables users to describe installation requests in natural language
(e.g., “install docker and kubernetes” or “set up python dev environment”).
It converts free-form text into structured intents that Cortex can act upon.

This improves demo reliability and usability by removing the need for strict
command syntax.

---

## Key Features
- Typo tolerance (e.g., kubernets → kubernetes, pyhton → python)
- Rule-based + fuzzy semantic intent detection
- Confidence scoring for detected intent
- Clarification prompts for ambiguous requests
- Slot extraction (python version, platform, packages)
- Lightweight, dependency-free core logic

---

## Usage Example

```python
from cortex.nl_parser import parse_request

result = parse_request("pls install pyhton 3.10 on mac")

print(result)
```
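A representative result for the call above is sketched below. The field names match what `parse_request` returns; the numeric values are illustrative and depend on the fuzzy-match scores:

```python
# Illustrative output only; exact confidence and explanation values will vary.
# {
#     "intent": "setup_python_env",
#     "confidence": 0.75,
#     "explanation": "corrected: pyhton->python; rule_intent=setup_python_env (0.90); semantic_match=setup_python_env (0.72)",
#     "slots": {"python_version": "3.10", "platform": "mac", "packages": ["python"]},
#     "corrections": [("pyhton", "python")],
#     "clarifications": [],
# }
```

Ambiguous requests (e.g. `parse_request("setup dev env")`) return a non-empty `clarifications` list so the caller can ask a follow-up question.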
37 changes: 37 additions & 0 deletions tests/test_nl_parser.py
@@ -0,0 +1,37 @@
import pytest
from cortex.nl_parser import parse_request

@pytest.mark.parametrize("text,expected", [
    ("install something for machine learning", "install_ml"),
    ("I need a web server", "install_web_server"),
    ("set up python development environment", "setup_python_env"),
    ("install docker and kubernets", "install_docker_k8s"),
    ("Can you provision a python env with pip, venv and flake8?", "setup_python_env"),
    ("need nginx or apache for a website", "install_web_server"),
    ("deploy containers - docker", "install_docker"),
    ("k8s and docker on my mac", "install_docker_k8s"),
    ("i want to run pytorch", "install_ml"),
    ("setup dev env", "ambiguous"),
    ("add docker", "install_docker"),
    ("pls install pyhton 3.10", "setup_python_env"),
])
def test_intent(text, expected):
    result = parse_request(text)
    intent = result["intent"]
    confidence = result["confidence"]

    if expected == "ambiguous":
        assert result["clarifications"], f"Expected clarifications for: {text}"
    else:
        assert intent == expected
        assert confidence >= 0.5

def test_corrections():
    r = parse_request("install docker and kubernets")
    assert r["intent"] == "install_docker_k8s"
    assert any(orig == "kubernets" for orig, _ in r["corrections"])

def test_slot_extraction():
    r = parse_request("pls install python 3.10 on mac")
    assert r["slots"].get("python_version") == "3.10"
    assert r["slots"].get("platform") in ("mac", "macos")