99 changes: 99 additions & 0 deletions .github/workflows/codeql.yml
@@ -0,0 +1,99 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL Advanced"

on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
schedule:
- cron: '44 23 * * 6'

jobs:
analyze:
name: Analyze (${{ matrix.language }})
# Runner size impacts CodeQL analysis time. To learn more, please see:
# - https://gh.io/recommended-hardware-resources-for-running-codeql
# - https://gh.io/supported-runners-and-hardware-resources
# - https://gh.io/using-larger-runners (GitHub.com only)
# Consider using larger runners or machines with greater resources for possible analysis time improvements.
runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
permissions:
# required for all workflows
security-events: write

# required to fetch internal or private CodeQL packs
packages: read

# only required for workflows in private repositories
actions: read
contents: read

strategy:
fail-fast: false
matrix:
include:
- language: python
build-mode: none
# CodeQL supports the following values for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'rust', 'swift'
# Use `c-cpp` to analyze code written in C, C++ or both
# Use 'java-kotlin' to analyze code written in Java, Kotlin or both
# Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
# To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
# see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
# If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
# your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
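# For example, a compiled language analyzed with a manual build would be added to the
# matrix above as an entry like the following (illustrative only; not enabled here):
# - language: c-cpp
#   build-mode: manual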
steps:
- name: Checkout repository
uses: actions/checkout@v4

# Add any setup steps before running the `github/codeql-action/init` action.
# This includes steps like installing compilers or runtimes (`actions/setup-node`
# or others). This is typically only required for manual builds.
# - name: Setup runtime (example)
# uses: actions/setup-example@v1

# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v4
with:
languages: ${{ matrix.language }}
build-mode: ${{ matrix.build-mode }}
# If you wish to specify custom queries, you can do so here or in a config file.
# By default, queries listed here will override any specified in a config file.
# Prefix the list here with "+" to use these queries and those in the config file.

# For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
# queries: security-extended,security-and-quality

# If the analyze step fails for one of the languages you are analyzing with
# "We were unable to automatically build your code", modify the matrix above
# to set the build mode to "manual" for that language. Then modify this step
# to build your code.
# ℹ️ Command-line programs to run using the OS shell.
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
- name: Run manual build steps
if: matrix.build-mode == 'manual'
shell: bash
run: |
echo 'If you are using a "manual" build mode for one or more of the' \
'languages you are analyzing, replace this with the commands to build' \
'your code, for example:'
echo ' make bootstrap'
echo ' make release'
exit 1

- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v4
with:
category: "/language:${{matrix.language}}"
222 changes: 222 additions & 0 deletions nl_parser.py
@@ -0,0 +1,222 @@
import difflib
import re
from difflib import SequenceMatcher
from typing import Dict, Any, List, Tuple

# Vocabulary for typo correction
VOCAB = {
"python", "pip", "venv", "virtualenv", "conda", "anaconda",
"docker", "kubernetes", "k8s", "kubectl",
"nginx", "apache", "httpd", "web", "server",
"flask", "django", "tensorflow", "pytorch", "torch",
"install", "setup", "development", "env", "environment",
}

# Canonical examples for lightweight semantic matching
INTENT_EXAMPLES = {
"install_ml": [
"install something for machine learning",
"install pytorch",
"install tensorflow",
"i want to run pytorch",
],
"install_web_server": [
"i need a web server",
"install nginx",
"install apache",
"set up a web server",
],
"setup_python_env": [
"set up python development environment",
"install python 3.10",
"create python venv",
"setup dev env",
],
"install_docker": [
"install docker",
"add docker",
"deploy containers - docker",
],
"install_docker_k8s": [
"install docker and kubernetes",
"docker and k8s",
"k8s and docker on my mac",
],
}


def normalize(text: str) -> str:
text = text.lower()
text = text.replace("-", " ")
text = re.sub(r"[^a-z0-9.\s]", " ", text)
text = re.sub(r"\s+", " ", text).strip()
return text


def tokenize(text: str) -> List[str]:
return text.split()


def spell_correct_token(token: str) -> Tuple[str, bool]:
"""Return corrected_token, was_corrected"""
if token in VOCAB:
return token, False
close = difflib.get_close_matches(token, VOCAB, n=1, cutoff=0.75)
if close:
return close[0], True
return token, False


def apply_spell_correction(tokens: List[str]) -> Tuple[List[str], List[Tuple[str, str]]]:
corrections = []
new_tokens = []
for t in tokens:
new, fixed = spell_correct_token(t)
if fixed:
corrections.append((t, new))
new_tokens.append(new)
return new_tokens, corrections
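# For example, with the VOCAB above difflib.get_close_matches("kubernets", VOCAB, n=1, cutoff=0.75)
# returns ["kubernetes"], so apply_spell_correction(["install", "kubernets"]) yields
# (["install", "kubernetes"], [("kubernets", "kubernetes")]); tokens with no close
# vocabulary match pass through uncorrected (see tests/test_nl_parser.py::test_corrections).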


def fuzzy_phrase_score(a: str, b: str) -> float:
return SequenceMatcher(None, a, b).ratio()


def semantic_intent_score(text: str) -> Tuple[str, float]:
"""Compare text with intent examples."""
best_intent = "unknown"
best_score = 0.0

for intent, examples in INTENT_EXAMPLES.items():
for ex in examples:
score = fuzzy_phrase_score(text, ex)
if score > best_score:
best_score = score
best_intent = intent

return best_intent, best_score
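# For example, semantic_intent_score("i want to run pytorch") returns ("install_ml", 1.0)
# because the text exactly matches an INTENT_EXAMPLES entry; paraphrases score lower,
# since SequenceMatcher rewards shared character runs rather than true semantic similarity.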


def rule_intent(text: str) -> Tuple[str, float]:
"""Simple keyword/rule-based detection."""
t = text

if "docker" in t:
if "kubernetes" in t or "k8s" in t or "kubectl" in t:
return "install_docker_k8s", 0.95
return "install_docker", 0.9

if "kubernetes" in t or "k8s" in t or "kubectl" in t:
return "install_docker_k8s", 0.9

if "nginx" in t or "apache" in t or "httpd" in t or "web server" in t:
return "install_web_server", 0.9

if "python" in t or "venv" in t or "conda" in t or "anaconda" in t:
return "setup_python_env", 0.9

if any(word in t for word in ("tensorflow", "pytorch", "torch", "machine learning", "ml")):
return "install_ml", 0.9

return "unknown", 0.0


VERSION_RE = re.compile(r"python\s*([0-9]+(?:\.[0-9]+)?)")

Check warning on line 124 in nl_parser.py
SonarQubeCloud / SonarCloud Code Analysis (reported twice, once per character class):
Use concise character class syntax '\d' instead of '[0-9]'.
PLATFORM_RE = re.compile(r"\b(mac|macos|windows|linux|ubuntu|debian)\b")
PACKAGE_RE = re.compile(r"\b(nginx|apache|docker|kubernetes|k8s|kubectl|python|pip|venv|conda|tensorflow|pytorch)\b")


def extract_slots(text: str) -> Dict[str, Any]:
slots = {}

v = VERSION_RE.search(text)
if v:
slots["python_version"] = v.group(1)

p = PLATFORM_RE.search(text)
if p:
slots["platform"] = p.group(1)

pkgs = PACKAGE_RE.findall(text)
if pkgs:
slots["packages"] = list(dict.fromkeys(pkgs)) # unique preserve order

return slots
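# For example, extract_slots("pls install python 3.10 on mac") returns
# {"python_version": "3.10", "platform": "mac", "packages": ["python"]}
# (this is the behaviour exercised by tests/test_nl_parser.py::test_slot_extraction).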


def aggregate_confidence(c_rule: float, c_sem: float, num_corrections: int, c_classifier: float = 0.0) -> float:
penalty = 1 - (num_corrections * 0.1)
penalty = max(0.0, penalty)

final = (
0.4 * c_rule +
0.4 * c_sem +
0.2 * c_classifier
) * penalty

return round(max(0.0, min(1.0, final)), 2)
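# Worked example: c_rule=0.9, c_sem=0.8, one correction and no classifier agreement gives
# (0.4*0.9 + 0.4*0.8 + 0.2*0.0) * 0.9 = 0.68 * 0.9 = 0.612, rounded to 0.61; each
# spelling correction shaves 10% off the blended score.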


def decide_clarifications(intent: str, confidence: float) -> List[str]:
if intent == "unknown" or confidence < 0.6:
return [
"Install Docker and Kubernetes",
"Set up Python development environment",
"Install a web server (nginx/apache)",
"Install ML libraries (tensorflow/pytorch)",
]
if intent == "setup_python_env" and confidence < 0.75:
return ["Use venv", "Use conda", "Install a specific Python version"]
return []


def parse_request(text: str) -> Dict[str, Any]:
"""Main function used by tests and demo."""
norm = normalize(text)
tokens = tokenize(norm)

tokens_corr, corrections = apply_spell_correction(tokens)
corrected_text = " ".join(tokens_corr)

rule_int, c_rule = rule_intent(corrected_text)
sem_int, c_sem = semantic_intent_score(corrected_text)

if rule_int != "unknown" and rule_int == sem_int:
chosen_intent = rule_int
c_classifier = 0.95
elif rule_int != "unknown":
chosen_intent = rule_int
c_classifier = 0.0
elif c_sem > 0.6:
chosen_intent = sem_int
c_classifier = 0.0
else:
chosen_intent = "unknown"
c_classifier = 0.0

slots = extract_slots(corrected_text)

confidence = aggregate_confidence(
c_rule, c_sem, len(corrections), c_classifier
)

clarifications = decide_clarifications(chosen_intent, confidence)

explanation = []
if corrections:
explanation.append(
"corrected: " + ", ".join(f"{a}->{b}" for a, b in corrections)
)
explanation.append(f"rule_intent={rule_int} ({c_rule:.2f})")
explanation.append(f"semantic_match={sem_int} ({c_sem:.2f})")

return {
"intent": chosen_intent,
"confidence": confidence,
"explanation": "; ".join(explanation),
"slots": slots,
"corrections": corrections,
"clarifications": clarifications,
}
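# Example round trip (behaviour follows from the functions above and is covered by
# tests/test_nl_parser.py):
#
#   parse_request("pls install pyhton 3.10") returns an intent of "setup_python_env",
#   corrections of [("pyhton", "python")], and slots of
#   {"python_version": "3.10", "packages": ["python"]}.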


37 changes: 37 additions & 0 deletions tests/test_nl_parser.py
@@ -0,0 +1,37 @@
import pytest
from nl_parser import parse_request

@pytest.mark.parametrize("text,expected", [
("install something for machine learning", "install_ml"),
("I need a web server", "install_web_server"),
("set up python development environment", "setup_python_env"),
("install docker and kubernets", "install_docker_k8s"),
("Can you provision a python env with pip, venv and flake8?", "setup_python_env"),
("need nginx or apache for a website", "install_web_server"),
("deploy containers - docker", "install_docker"),
("k8s and docker on my mac", "install_docker_k8s"),
("i want to run pytorch", "install_ml"),
("setup dev env", "ambiguous"),
("add docker", "install_docker"),
("pls install pyhton 3.10", "setup_python_env"),
])
def test_intent(text, expected):
result = parse_request(text)
intent = result["intent"]
confidence = result["confidence"]

if expected == "ambiguous":
assert result["clarifications"], f"Expected clarifications for: {text}"
else:
assert intent == expected
assert confidence >= 0.5

def test_corrections():
r = parse_request("install docker and kubernets")
assert r["intent"] == "install_docker_k8s"
assert any(orig == "kubernets" for orig, _ in r["corrections"])

def test_slot_extraction():
r = parse_request("pls install python 3.10 on mac")
assert r["slots"].get("python_version") == "3.10"
assert r["slots"].get("platform") in ("mac", "macos")