eval

Release Khoj version 2.0.0-beta.15 #229

Workflow file for this run

.github/workflows/run_evals.yml at edf9ea6

	# Evaluation workflow: runs on every pushed tag and on manual dispatch.
	name: eval

	on:
	  # Run on every release
	  push:
	    tags:
	      - "*"
	  # Allow manual triggers from GitHub UI
	  workflow_dispatch:
	    inputs:
	      khoj_mode:
	        description: 'Khoj Mode (general/default/research)'
	        required: true
	        default: 'default'
	        type: choice
	        options:
	          - general
	          - default
	          - research
	      dataset:
	        # Keep this list in sync with the options below
	        description: 'Dataset to evaluate (frames/simpleqa/gpqa/math500)'
	        required: true
	        default: 'frames'
	        type: choice
	        options:
	          - frames
	          - simpleqa
	          - gpqa
	          - math500
	      sample_size:
	        description: 'Number of samples to evaluate'
	        required: false
	        default: 200
	        type: number
	      sandbox:
	        description: 'Code sandbox to use'
	        required: false
	        default: 'terrarium'
	        type: choice
	        options:
	          - terrarium
	          - e2b
	      chat_model:
	        description: 'Chat model to use'
	        required: false
	        default: 'gemini-2.5-flash'
	        type: string
	      max_research_iterations:
	        description: 'Maximum number of iterations in research mode'
	        required: false
	        default: 5
	        type: number
	      # NOTE: this is a plain string input, not a repository secret. The job
	      # reads it back from the event payload and masks it in its first
	      # scripted step before exporting it to later steps.
	      openai_api_key:
	        description: 'OpenAI API key'
	        required: false
	        default: ''
	        type: string
	      openai_base_url:
	        description: 'Base URL of OpenAI compatible API'
	        required: false
	        default: 'https://api.openai.com/v1'
	        type: string
	      # The two toggles below are string choices ('false'/'true') rather than
	      # booleans, so they reach the job as non-empty strings either way.
	      auto_read_webpage:
	        description: 'Auto read webpage on online search'
	        required: false
	        default: 'false'
	        type: choice
	        options:
	          - 'false'
	          - 'true'
	      randomize:
	        description: 'Randomize the sample of questions'
	        required: false
	        default: 'true'
	        type: choice
	        options:
	          - 'false'
	          - 'true'

	jobs:
	  # Starts a Khoj server backed by the Postgres service container, runs
	  # tests/evals/eval.py against it for each (khoj_mode, dataset) pair in the
	  # matrix, then uploads and summarizes the results.
	  eval:
	    runs-on: ubuntu-latest
	    strategy:
	      matrix:
	        # Use input from manual trigger if available, else run every mode
	        # against the frames and gpqa datasets
	        khoj_mode: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.khoj_mode)) || fromJSON('["general", "default", "research"]') }}
	        dataset: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.dataset)) || fromJSON('["frames", "gpqa"]') }}

	    services:
	      postgres:
	        image: ankane/pgvector
	        env:
	          POSTGRES_PASSWORD: postgres
	          POSTGRES_USER: postgres
	          POSTGRES_DB: postgres
	        ports:
	          - "5432:5432"
	        # Folded into a single line of container options
	        options: >-
	          --health-cmd pg_isready
	          --health-interval 10s
	          --health-timeout 5s
	          --health-retries 5

	    steps:
	      # checkout@v3 targets the retired Node 16 runtime; v4 matches the
	      # generation of upload-artifact@v4 used later in this job.
	      - uses: actions/checkout@v4
	        with:
	          # Full history (presumably so hatch can derive the version from
	          # git tags; confirm)
	          fetch-depth: 0

	      - name: Install uv
	        uses: astral-sh/setup-uv@v4
	        with:
	          version: "latest"

	      - name: Set up Python
	        run: uv python install 3.10

	      - name: Get App Version
	        id: hatch
	        run: |
	          # Mask relevant workflow inputs as secret early.
	          # Tag pushes carry no `inputs` object, so fall back to an empty
	          # value instead of the literal string "null" that jq would print.
	          OPENAI_API_KEY=$(jq -r '.inputs.openai_api_key // empty' "$GITHUB_EVENT_PATH")
	          if [ -n "$OPENAI_API_KEY" ]; then
	            echo "::add-mask::$OPENAI_API_KEY"
	          fi
	          echo "OPENAI_API_KEY=$OPENAI_API_KEY" >> "$GITHUB_ENV"

	          # Get app version from hatch
	          echo "version=$(pipx run hatch version)" >> "$GITHUB_OUTPUT"

	      - name: ⏬️ Install Dependencies
	        env:
	          DEBIAN_FRONTEND: noninteractive
	        run: |
	          # install dependencies
	          sudo apt update && sudo apt install -y git libegl1 sqlite3 libsqlite3-dev libsqlite3-0 ffmpeg libsm6 libxext6
	          # install terrarium for code sandbox
	          git clone https://github.com/khoj-ai/terrarium.git && cd terrarium && npm install --legacy-peer-deps && mkdir pyodide_cache

	      - name: ⬇️ Install Application
	        env:
	          UV_INDEX: "https://download.pytorch.org/whl/cpu"
	          UV_INDEX_STRATEGY: "unsafe-best-match"
	          CUDA_VISIBLE_DEVICES: ""
	        run: |
	          # Replace the dynamic version declaration with the version hatch reported
	          sed -i 's/dynamic = \["version"\]/version = "${{ steps.hatch.outputs.version }}"/' pyproject.toml
	          uv sync --all-extras

	      - name: 📝 Run Eval
	        env:
	          KHOJ_MODE: ${{ matrix.khoj_mode }}
	          # Manual runs use the dispatch inputs; tag pushes use the fallbacks
	          SAMPLE_SIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.sample_size || 200 }}
	          BATCH_SIZE: "20"
	          RANDOMIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.randomize || 'true' }}
	          KHOJ_URL: "http://localhost:42110"
	          KHOJ_LLM_SEED: "42"
	          KHOJ_DEFAULT_CHAT_MODEL: ${{ github.event_name == 'workflow_dispatch' && inputs.chat_model || 'gemini-2.5-flash' }}
	          # NOTE(review): the tag-push fallback (10) differs from the manual
	          # input default (5); confirm this is intentional
	          KHOJ_RESEARCH_ITERATIONS: ${{ github.event_name == 'workflow_dispatch' && inputs.max_research_iterations || 10 }}
	          KHOJ_AUTO_READ_WEBPAGE: ${{ github.event_name == 'workflow_dispatch' && inputs.auto_read_webpage || 'false' }}
	          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
	          OPENAI_BASE_URL: ${{ github.event_name == 'workflow_dispatch' && inputs.openai_base_url || 'https://api.openai.com/v1' }}
	          # Web search/scrape keys are withheld for the math500 dataset
	          SERPER_DEV_API_KEY: ${{ matrix.dataset != 'math500' && secrets.SERPER_DEV_API_KEY || '' }}
	          OLOSTEP_API_KEY: ${{ matrix.dataset != 'math500' && secrets.OLOSTEP_API_KEY || '' }}
	          FIRECRAWL_API_KEY: ${{ matrix.dataset != 'math500' && secrets.FIRECRAWL_API_KEY || '' }}
	          HF_TOKEN: ${{ secrets.HF_TOKEN }}
	          # Only provided when the e2b sandbox was selected
	          E2B_API_KEY: ${{ inputs.sandbox == 'e2b' && secrets.E2B_API_KEY || '' }}
	          E2B_TEMPLATE: ${{ vars.E2B_TEMPLATE }}
	          KHOJ_ADMIN_EMAIL: khoj
	          KHOJ_ADMIN_PASSWORD: khoj
	          POSTGRES_HOST: localhost
	          POSTGRES_PORT: "5432"
	          POSTGRES_USER: postgres
	          POSTGRES_PASSWORD: postgres
	          POSTGRES_DB: postgres
	          USE_EMBEDDED_DB: "true"
	          KHOJ_TELEMETRY_DISABLE: "True" # To disable telemetry for tests
	        run: |
	          set -euo pipefail

	          # Start Khoj server in background
	          # Capture stdout/stderr to a log for debugging if startup fails
	          uv run khoj --anonymous-mode --non-interactive -vv > khoj_server.log 2>&1 &
	          KHOJ_PID=$!
	          echo "Started Khoj (PID=$KHOJ_PID)"

	          # Start code sandbox
	          npm install -g pm2
	          NODE_ENV=production npm run ci --prefix terrarium

	          # Wait for server to be ready
	          timeout=120
	          while ! curl -s http://localhost:42110/api/health > /dev/null; do
	            # If process died, surface logs and fail fast
	            if ! kill -0 "$KHOJ_PID" 2>/dev/null; then
	              echo "Khoj process exited before becoming healthy. Logs:" >&2
	              sed -n '1,200p' khoj_server.log >&2 || true
	              exit 1
	            fi
	            if [ $timeout -le 0 ]; then
	              echo "Timed out waiting for Khoj server. Partial logs:" >&2
	              sed -n '1,200p' khoj_server.log >&2 || true
	              exit 1
	            fi
	            echo "Waiting for Khoj server... ($timeout s left)"
	            sleep 2
	            timeout=$((timeout-2))
	          done
	          echo "Khoj server is healthy"

	          # Run evals
	          uv run python tests/evals/eval.py -d ${{ matrix.dataset }}

	      - name: Upload Results
	        if: always() # Upload results even if tests fail
	        uses: actions/upload-artifact@v4
	        with:
	          name: eval-results-${{ steps.hatch.outputs.version }}-${{ matrix.khoj_mode }}-${{ matrix.dataset }}
	          path: |
	            _evaluation_results_.csv
	            _evaluation_summary_.txt
	            khoj_server.log

	      - name: Display Results
	        if: always()
	        env:
	          # Workflow inputs are handed to the script as environment variables
	          # rather than interpolated into it, so a free-form value such as
	          # the chat model name cannot be executed as shell code.
	          CHAT_MODEL: ${{ inputs.chat_model || 'gemini-2.5-flash' }}
	          CODE_SANDBOX: ${{ inputs.sandbox || 'terrarium' }}
	        run: |
	          # This step also runs after a failed eval, when no summary may exist
	          if [ ! -f _evaluation_summary_.txt ]; then
	            echo "No evaluation summary was produced for ${{ matrix.dataset }} in ${{ matrix.khoj_mode }} mode" >> $GITHUB_STEP_SUMMARY
	            exit 0
	          fi

	          # Read and display summary
	          echo "## Evaluation Summary of Khoj on ${{ matrix.dataset }} in ${{ matrix.khoj_mode }} mode" >> $GITHUB_STEP_SUMMARY
	          echo "*$(head -n 1 _evaluation_summary_.txt)*" >> $GITHUB_STEP_SUMMARY
	          echo "- Khoj Version: ${{ steps.hatch.outputs.version }}" >> $GITHUB_STEP_SUMMARY
	          echo "- Chat Model: $CHAT_MODEL" >> $GITHUB_STEP_SUMMARY
	          echo "- Code Sandbox: $CODE_SANDBOX" >> $GITHUB_STEP_SUMMARY
	          echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
	          tail -n +2 _evaluation_summary_.txt >> $GITHUB_STEP_SUMMARY
	          echo "" >> $GITHUB_STEP_SUMMARY
	          echo "\`\`\`" >> $GITHUB_STEP_SUMMARY

	          # Display in logs too
	          echo "===== EVALUATION RESULTS ====="
	          cat _evaluation_summary_.txt

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Release Khoj version 2.0.0-beta.15 #229

Workflow file

Release Khoj version 2.0.0-beta.15 #229

Uh oh!

Jobs

Run details

Workflow file for this run