# Release Khoj version 2.0.0-beta.15 #229
# Workflow file for this run
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: eval

# Triggers: automatic on any tag push, or manual with the tunable inputs below.
on:
  # Run on every release (any pushed tag)
  push:
    tags:
      - "*"
  # Allow manual triggers from GitHub UI
  workflow_dispatch:
    inputs:
      khoj_mode:
        description: 'Khoj Mode (general/default/research)'
        required: true
        default: 'default'
        type: choice
        options:
          - general
          - default
          - research
      dataset:
        # Description lists every selectable option below.
        description: 'Dataset to evaluate (frames/simpleqa/gpqa/math500)'
        required: true
        default: 'frames'
        type: choice
        options:
          - frames
          - simpleqa
          - gpqa
          - math500
      sample_size:
        description: 'Number of samples to evaluate'
        required: false
        default: 200
        type: number
      sandbox:
        description: 'Code sandbox to use'
        required: false
        default: 'terrarium'
        type: choice
        options:
          - terrarium
          - e2b
      chat_model:
        description: 'Chat model to use'
        required: false
        default: 'gemini-2.5-flash'
        type: string
      max_research_iterations:
        description: 'Maximum number of iterations in research mode'
        required: false
        default: 5
        type: number
      openai_api_key:
        description: 'OpenAI API key'
        required: false
        default: ''
        type: string
      openai_base_url:
        description: 'Base URL of OpenAI compatible API'
        required: false
        default: 'https://api.openai.com/v1'
        type: string
      # Boolean-like toggles are modelled as string choices ('false'/'true').
      auto_read_webpage:
        description: 'Auto read webpage on online search'
        required: false
        default: 'false'
        type: choice
        options:
          - 'false'
          - 'true'
      randomize:
        description: 'Randomize the sample of questions'
        required: false
        default: 'true'
        type: choice
        options:
          - 'false'
          - 'true'
jobs:
  # Runs tests/evals/eval.py once per (khoj_mode, dataset) matrix cell against a
  # Khoj server started inside the job, with a pgvector Postgres service container.
  eval:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Use input from manual trigger if available, else run the default
        # set of modes and datasets listed in the fallback arrays
        khoj_mode: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.khoj_mode)) || fromJSON('["general", "default", "research"]') }}
        dataset: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.dataset)) || fromJSON('["frames", "gpqa"]') }}
    services:
      postgres:
        image: ankane/pgvector
        env:
          POSTGRES_PASSWORD: postgres
          POSTGRES_USER: postgres
          POSTGRES_DB: postgres
        ports:
          - "5432:5432"
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Install uv
        uses: astral-sh/setup-uv@v4
        with:
          version: "latest"

      - name: Set up Python
        run: uv python install 3.10

      - name: Get App Version
        id: hatch
        run: |
          # Mask relevant workflow inputs as secret early.
          # `// empty` stops jq from printing the literal string "null" when the
          # event payload has no inputs (tag pushes), which would otherwise be
          # masked in logs and exported as OPENAI_API_KEY=null.
          OPENAI_API_KEY=$(jq -r '.inputs.openai_api_key // empty' "$GITHUB_EVENT_PATH")
          if [ -n "$OPENAI_API_KEY" ]; then
            echo "::add-mask::$OPENAI_API_KEY"
            echo "OPENAI_API_KEY=$OPENAI_API_KEY" >> "$GITHUB_ENV"
          fi
          # Get app version from hatch
          echo "version=$(pipx run hatch version)" >> "$GITHUB_OUTPUT"

      - name: ⏬️ Install Dependencies
        env:
          DEBIAN_FRONTEND: noninteractive
        run: |
          # install dependencies
          sudo apt update && sudo apt install -y git libegl1 sqlite3 libsqlite3-dev libsqlite3-0 ffmpeg libsm6 libxext6
          # install terrarium for code sandbox
          git clone https://github.com/khoj-ai/terrarium.git && cd terrarium && npm install --legacy-peer-deps && mkdir pyodide_cache

      - name: ⬇️ Install Application
        env:
          UV_INDEX: "https://download.pytorch.org/whl/cpu"
          UV_INDEX_STRATEGY: "unsafe-best-match"
          CUDA_VISIBLE_DEVICES: ""
        run: |
          # Replace the dynamic version declaration with the version resolved earlier
          sed -i 's/dynamic = \["version"\]/version = "${{ steps.hatch.outputs.version }}"/' pyproject.toml
          uv sync --all-extras

      - name: 📝 Run Eval
        env:
          KHOJ_MODE: ${{ matrix.khoj_mode }}
          SAMPLE_SIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.sample_size || 200 }}
          BATCH_SIZE: "20"
          RANDOMIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.randomize || 'true' }}
          KHOJ_URL: "http://localhost:42110"
          KHOJ_LLM_SEED: "42"
          KHOJ_DEFAULT_CHAT_MODEL: ${{ github.event_name == 'workflow_dispatch' && inputs.chat_model || 'gemini-2.5-flash' }}
          # NOTE(review): fallback here is 10 while the workflow_dispatch input
          # default is 5 - confirm the difference is intentional.
          KHOJ_RESEARCH_ITERATIONS: ${{ github.event_name == 'workflow_dispatch' && inputs.max_research_iterations || 10 }}
          KHOJ_AUTO_READ_WEBPAGE: ${{ github.event_name == 'workflow_dispatch' && inputs.auto_read_webpage || 'false' }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          OPENAI_BASE_URL: ${{ github.event_name == 'workflow_dispatch' && inputs.openai_base_url || 'https://api.openai.com/v1' }}
          # Web search/scrape keys are left empty for the math500 dataset
          SERPER_DEV_API_KEY: ${{ matrix.dataset != 'math500' && secrets.SERPER_DEV_API_KEY || '' }}
          OLOSTEP_API_KEY: ${{ matrix.dataset != 'math500' && secrets.OLOSTEP_API_KEY || '' }}
          FIRECRAWL_API_KEY: ${{ matrix.dataset != 'math500' && secrets.FIRECRAWL_API_KEY || '' }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          # E2B key is only provided when the e2b sandbox was selected
          E2B_API_KEY: ${{ inputs.sandbox == 'e2b' && secrets.E2B_API_KEY || '' }}
          E2B_TEMPLATE: ${{ vars.E2B_TEMPLATE }}
          KHOJ_ADMIN_EMAIL: khoj
          KHOJ_ADMIN_PASSWORD: khoj
          POSTGRES_HOST: localhost
          POSTGRES_PORT: "5432"
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: postgres
          POSTGRES_DB: postgres
          USE_EMBEDDED_DB: "true"
          KHOJ_TELEMETRY_DISABLE: "True"  # To disable telemetry for tests
        run: |
          set -euo pipefail

          # Start Khoj server in background
          # Capture stdout/stderr to a log for debugging if startup fails
          uv run khoj --anonymous-mode --non-interactive -vv > khoj_server.log 2>&1 &
          KHOJ_PID=$!
          echo "Started Khoj (PID=$KHOJ_PID)"

          # Start code sandbox
          npm install -g pm2
          NODE_ENV=production npm run ci --prefix terrarium

          # Wait for server to be ready (polls every 2s, up to 120s)
          timeout=120
          while ! curl -s http://localhost:42110/api/health > /dev/null; do
            # If process died, surface logs and fail fast
            if ! kill -0 "$KHOJ_PID" 2>/dev/null; then
              echo "Khoj process exited before becoming healthy. Logs:" >&2
              sed -n '1,200p' khoj_server.log >&2 || true
              exit 1
            fi
            if [ $timeout -le 0 ]; then
              echo "Timed out waiting for Khoj server. Partial logs:" >&2
              sed -n '1,200p' khoj_server.log >&2 || true
              exit 1
            fi
            echo "Waiting for Khoj server... ($timeout s left)"
            sleep 2
            timeout=$((timeout-2))
          done
          echo "Khoj server is healthy"

          # Run evals
          uv run python tests/evals/eval.py -d ${{ matrix.dataset }}

      - name: Upload Results
        if: always()  # Upload results even if tests fail
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ steps.hatch.outputs.version }}-${{ matrix.khoj_mode }}-${{ matrix.dataset }}
          path: |
            *_evaluation_results_*.csv
            *_evaluation_summary_*.txt
            khoj_server.log

      - name: Display Results
        if: always()
        # Values are handed to the script through env vars rather than expanded
        # inline, so free-form inputs (e.g. chat_model) cannot inject shell code.
        env:
          EVAL_DATASET: ${{ matrix.dataset }}
          EVAL_KHOJ_MODE: ${{ matrix.khoj_mode }}
          EVAL_VERSION: ${{ steps.hatch.outputs.version }}
          EVAL_CHAT_MODEL: ${{ inputs.chat_model || 'gemini-2.5-flash' }}
          EVAL_SANDBOX: ${{ inputs.sandbox || 'terrarium' }}
        run: |
          # Collect summary files; an unmatched glob yields an empty array
          shopt -s nullglob
          summaries=(*_evaluation_summary_*.txt)
          if [ "${#summaries[@]}" -eq 0 ]; then
            # Eval did not get far enough to write a summary: say so clearly
            # and keep the step failing, as it did before.
            echo "## No evaluation summary produced for $EVAL_DATASET in $EVAL_KHOJ_MODE mode" >> "$GITHUB_STEP_SUMMARY"
            echo "No *_evaluation_summary_*.txt file found" >&2
            exit 1
          fi

          # Read and display summary
          {
            echo "## Evaluation Summary of Khoj on $EVAL_DATASET in $EVAL_KHOJ_MODE mode"
            echo "**$(head -n 1 "${summaries[@]}")**"
            echo "- Khoj Version: $EVAL_VERSION"
            echo "- Chat Model: $EVAL_CHAT_MODEL"
            echo "- Code Sandbox: $EVAL_SANDBOX"
            echo '```'
            tail -n +2 "${summaries[@]}"
            echo ""
            echo '```'
          } >> "$GITHUB_STEP_SUMMARY"

          # Display in logs too
          echo "===== EVALUATION RESULTS ====="
          cat "${summaries[@]}"