Scale-tested evolutionary prompt search (EPS) platform hosting tens of thousands of prompts with evaluation scores. Integrates with Eval-Genius and Async-Toolformer-Orchestrator for continuous A/B testing and prompt optimization at scale.
Meta-Prompt-Evolution-Hub uses evolutionary algorithms to automatically discover, optimize, and maintain high-performing prompts across diverse tasks. The platform continuously evolves prompt populations through mutation, crossover, and selection based on real-world performance metrics.
- Massive Scale: Handle 100K+ prompts with distributed evaluation
- Evolutionary Optimization: Genetic algorithms for prompt discovery
- Continuous A/B Testing: Real-time performance tracking
- Multi-Objective: Optimize for accuracy, cost, latency, and safety
- Version Control: Git-like prompt versioning and branching
- Auto-Documentation: Self-documenting prompt genealogy
# Basic installation
pip install meta-prompt-evolution-hub
# With distributed computing
pip install "meta-prompt-evolution-hub[distributed]"
# With all evaluation frameworks
pip install "meta-prompt-evolution-hub[full]"
# Development installation
git clone https://github.com/yourusername/meta-prompt-evolution-hub
cd meta-prompt-evolution-hub
pip install -e ".[dev]"
import numpy as np

from meta_prompt_evolution import EvolutionHub, PromptPopulation

# Initialize evolution hub
hub = EvolutionHub(
    population_size=1000,
    generations=50,
    mutation_rate=0.1,
    crossover_rate=0.7
)

# Create initial population
population = PromptPopulation.from_seeds([
    "You are a helpful assistant. {task}",
    "As an AI assistant, I will {task}",
    "Let me help you with {task}"
])

# Define fitness function (llm, evaluate_response, and test_dataset are
# placeholders for your own model client, scorer, and evaluation data;
# see the sketch after this example)
def fitness_function(prompt, test_cases):
    scores = []
    for case in test_cases:
        response = llm(prompt.format(task=case.task))
        score = evaluate_response(response, case.expected)
        scores.append(score)
    return np.mean(scores)

# Evolve population
evolved_population = hub.evolve(
    population=population,
    fitness_fn=fitness_function,
    test_cases=test_dataset,
    selection_method="tournament"
)

# Get best prompts
best_prompts = evolved_population.get_top_k(10)
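The quick-start code assumes an `llm` callable plus an `evaluate_response` scorer and a `test_dataset` of task/expected pairs; none of these ship with the package. A minimal sketch of what those hypothetical pieces might look like (names and scoring logic are illustrative only):

from dataclasses import dataclass

@dataclass
class TestCase:
    task: str       # task description substituted into the prompt template
    expected: str   # reference answer used for scoring

def evaluate_response(response: str, expected: str) -> float:
    # Toy scorer: token-overlap F1 between the model response and the reference.
    pred, ref = set(response.lower().split()), set(expected.lower().split())
    if not pred or not ref:
        return 0.0
    overlap = len(pred & ref)
    precision, recall = overlap / len(pred), overlap / len(ref)
    return 0.0 if overlap == 0 else 2 * precision * recall / (precision + recall)

test_dataset = [
    TestCase(task="summarize the release notes in one sentence", expected="a one-sentence summary"),
    TestCase(task="translate 'bonjour' to English", expected="hello"),
]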
from meta_prompt_evolution import ABTestOrchestrator

# Set up A/B testing
ab_tester = ABTestOrchestrator(
    production_endpoint="https://api.example.com",
    metrics=["accuracy", "latency", "user_satisfaction"],
    confidence_level=0.95
)

# Deploy prompt variants (reusing the top prompts from the evolution run above)
variants = {
    "control": current_production_prompt,
    "variant_a": best_prompts[0],
    "variant_b": best_prompts[1]
}

ab_tester.deploy_test(
    variants=variants,
    traffic_split=[0.5, 0.25, 0.25],
    duration_hours=24,
    min_samples=10000
)

# Monitor results
results = ab_tester.get_results()
if results.variant_a.is_significant_improvement():
    ab_tester.promote_to_production("variant_a")
meta-prompt-evolution-hub/
├── meta_prompt_evolution/
│   ├── evolution/
│   │   ├── algorithms/      # Evolutionary algorithms
│   │   ├── operators/       # Mutation, crossover operators
│   │   ├── selection/       # Selection strategies
│   │   └── diversity/       # Diversity maintenance
│   ├── evaluation/
│   │   ├── metrics/         # Evaluation metrics
│   │   ├── benchmarks/      # Standard benchmarks
│   │   ├── distributed/     # Distributed evaluation
│   │   └── caching/         # Result caching
│   ├── optimization/
│   │   ├── multi_objective/ # Pareto optimization
│   │   ├── constraints/     # Safety constraints
│   │   └── adaptive/        # Adaptive parameters
│   ├── storage/
│   │   ├── database/        # Prompt database
│   │   ├── versioning/      # Version control
│   │   └── indexing/        # Efficient search
│   ├── deployment/
│   │   ├── ab_testing/      # A/B test orchestration
│   │   ├── monitoring/      # Performance monitoring
│   │   └── rollback/        # Safety mechanisms
│   └── analysis/
│       ├── genealogy/       # Prompt lineage
│       ├── clustering/      # Prompt clustering
│       └── insights/        # Pattern discovery
├── web_ui/                  # Web interface
├── cli/                     # Command-line tools
└── examples/                # Example configurations
from meta_prompt_evolution.evolution import (
    NSGA2, CMA_ES, MAP_Elites,
    NoveltySearch, QualityDiversity
)

# Multi-objective optimization with NSGA-II
nsga2 = NSGA2(
    objectives=["accuracy", "brevity", "clarity"],
    population_size=500
)

pareto_front = nsga2.evolve(
    initial_population=population,
    generations=100,
    mutation_ops=[
        "word_substitution",
        "sentence_reordering",
        "instruction_injection"
    ]
)

# MAP-Elites for diversity
map_elites = MAP_Elites(
    behavior_dimensions=["formality", "specificity", "length"],
    grid_resolution=20
)

diverse_prompts = map_elites.evolve(
    population=population,
    niche_size=10,
    generations=200
)
import random

from sentence_transformers import SentenceTransformer

from meta_prompt_evolution.operators import MutationOperator

@MutationOperator.register("semantic_mutation")
class SemanticMutation:
    def __init__(self, similarity_threshold=0.8):
        self.embedder = SentenceTransformer('all-mpnet-base-v2')
        self.threshold = similarity_threshold

    def mutate(self, prompt):
        # Extract key concepts (extract_concepts and find_similar_terms are
        # user-supplied helpers; see the sketch below)
        concepts = extract_concepts(prompt)

        # Find semantically similar replacements
        mutations = []
        for concept in concepts:
            similar = find_similar_terms(concept, self.threshold)
            mutated = prompt.replace(concept, random.choice(similar))
            mutations.append(mutated)
        return random.choice(mutations)

# Use custom operator
hub.add_mutation_operator(
    SemanticMutation(),
    probability=0.3
)
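`extract_concepts` and `find_similar_terms` used by the operator are not part of the package. A rough sketch of how they could be implemented with the same sentence-transformer embedder (illustrative only, not the library's behaviour):

import numpy as np
from sentence_transformers import SentenceTransformer

def extract_concepts(prompt, max_concepts=5):
    # Naive concept extraction: pick the longest content words in the prompt.
    words = [w.strip(".,!?") for w in prompt.split() if len(w) > 4]
    return sorted(set(words), key=len, reverse=True)[:max_concepts]

def find_similar_terms(concept, threshold, vocabulary=None, embedder=None):
    # Rank candidate replacements by cosine similarity to the concept.
    vocabulary = vocabulary or ["assist", "support", "guide", "explain", "answer"]
    embedder = embedder or SentenceTransformer("all-mpnet-base-v2")
    concept_vec = embedder.encode([concept])[0]
    candidate_vecs = embedder.encode(vocabulary)
    sims = candidate_vecs @ concept_vec / (
        np.linalg.norm(candidate_vecs, axis=1) * np.linalg.norm(concept_vec)
    )
    similar = [term for term, s in zip(vocabulary, sims) if s >= threshold]
    return similar or [concept]  # fall back to the original term if nothing is close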
from meta_prompt_evolution.distributed import DistributedEvaluator
import ray

# Initialize Ray cluster
ray.init(address="ray://head-node:10001")

# Distributed evaluator
evaluator = DistributedEvaluator(
    num_workers=100,
    batch_size=50,
    timeout=30  # seconds per evaluation
)

# Parallel evaluation (run_evaluation is a placeholder for your own scorer)
@ray.remote
def evaluate_prompt(prompt, test_batch):
    results = []
    for test in test_batch:
        result = run_evaluation(prompt, test)
        results.append(result)
    return results

# Evaluate entire population
fitness_scores = evaluator.evaluate_population(
    population=population,
    test_suite=large_test_suite,
    cache_results=True
)
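The `@ray.remote` function above is defined but never dispatched; `DistributedEvaluator.evaluate_population` presumably handles scheduling internally. If you prefer to drive Ray directly, a minimal dispatch sketch (reusing the same placeholder `run_evaluation`, and assuming the population is iterable over prompts):

# Submit one remote task per prompt, then block until all results return.
futures = [
    evaluate_prompt.remote(prompt, large_test_suite)
    for prompt in population
]
raw_results = ray.get(futures)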
from meta_prompt_evolution.evaluation import AsyncEvaluator

# Continuous evaluation pipeline
async_eval = AsyncEvaluator(
    models=["gpt-4", "claude-2", "llama-70b"],
    rate_limits={"gpt-4": 100, "claude-2": 50, "llama-70b": 200}
)

# Stream evaluations (the population to evolve is passed in as an argument)
async def continuous_evolution(population):
    async for generation in hub.evolve_async(population):
        # Evaluate in parallel across models
        scores = await async_eval.evaluate_generation(generation)

        # Update population based on scores
        population = await hub.select_next_generation(
            generation,
            scores,
            elitism_rate=0.1
        )

        # Checkpoint best prompts
        if generation.number % 10 == 0:
            await hub.checkpoint(generation.best_prompts(100))
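The coroutine does not run by itself; a minimal way to kick it off from synchronous code:

import asyncio

# Run the continuous evolution loop over the current population.
asyncio.run(continuous_evolution(population))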
from meta_prompt_evolution.storage import PromptDatabase

# Initialize database
db = PromptDatabase(
    backend="postgresql",
    connection_string="postgresql://localhost/prompts"
)

# Store prompt with metadata
db.store_prompt(
    prompt=evolved_prompt,
    metadata={
        "generation": 42,
        "fitness_scores": {"accuracy": 0.95, "latency": 120},
        "parent_ids": [parent1.id, parent2.id],
        "mutations_applied": ["semantic_shift", "length_reduction"],
        "test_results": detailed_results
    }
)

# Query prompts
high_performing = db.query(
    filters={
        "fitness_scores.accuracy": {"$gt": 0.9},
        "fitness_scores.latency": {"$lt": 150},
        "generation": {"$gte": 30}
    },
    order_by="fitness_scores.accuracy",
    limit=50
)

# Semantic search
similar_prompts = db.semantic_search(
    query="explain quantum computing simply",
    k=20,
    threshold=0.85
)
from meta_prompt_evolution.versioning import PromptVersionControl

# Git-like version control
vcs = PromptVersionControl(db)

# Create branch for experimentation
branch = vcs.create_branch("quantum_explanation_v2")

# Commit evolved prompt
commit = vcs.commit(
    prompt=new_prompt,
    message="Improved clarity for technical concepts",
    parent_commit=previous_commit_hash
)

# Merge successful evolution
if new_prompt.fitness > baseline.fitness:
    vcs.merge(
        source_branch="quantum_explanation_v2",
        target_branch="main",
        strategy="fitness_weighted"
    )

# Roll back if needed
if production_issues_detected:
    vcs.rollback(to_commit=last_stable_commit)
from meta_prompt_evolution.optimization import ParetoOptimizer

# Define objectives
objectives = {
    "accuracy": "maximize",
    "cost": "minimize",
    "latency": "minimize",
    "safety_score": "maximize"
}

# Pareto optimization
pareto_opt = ParetoOptimizer(
    objectives=objectives,
    constraint_functions={
        "min_accuracy": lambda x: x["accuracy"] >= 0.8,
        "max_cost": lambda x: x["cost"] <= 0.01,
        "safety_threshold": lambda x: x["safety_score"] >= 0.95
    }
)

# Find Pareto-optimal prompts
pareto_prompts = pareto_opt.optimize(
    population=population,
    generations=100,
    archive_size=1000
)

# Interactive selection
selected_prompt = pareto_opt.interactive_selection(
    pareto_prompts,
    preference_weights={
        "accuracy": 0.4,
        "cost": 0.3,
        "latency": 0.2,
        "safety_score": 0.1
    }
)
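One common way preference weights like these are applied is a weighted sum over min-max normalized objectives, with "minimize" objectives inverted so that higher is always better. A rough sketch of that idea (not necessarily the library's exact scoring), where each candidate is a dict of raw objective values:

import numpy as np

def preference_scores(candidates, objectives, weights):
    # candidates: list of dicts mapping objective name -> raw value
    names = list(objectives)
    matrix = np.array([[c[name] for name in names] for c in candidates], dtype=float)
    lo, hi = matrix.min(axis=0), matrix.max(axis=0)
    normed = (matrix - lo) / np.where(hi > lo, hi - lo, 1.0)  # min-max normalize per objective
    for j, name in enumerate(names):
        if objectives[name] == "minimize":
            normed[:, j] = 1.0 - normed[:, j]                 # invert so higher is better
    w = np.array([weights[name] for name in names])
    return normed @ w                                         # one score per candidate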
from meta_prompt_evolution.analysis import GenealogyAnalyzer

analyzer = GenealogyAnalyzer(db)

# Trace prompt lineage
lineage = analyzer.trace_lineage(
    prompt_id=best_prompt.id,
    generations_back=50
)

# Visualize evolution tree
lineage.visualize(
    color_by="fitness_score",
    node_size_by="usage_count",
    save_path="prompt_evolution_tree.html"
)

# Identify breakthrough mutations
breakthroughs = analyzer.find_breakthrough_moments(
    metric="accuracy",
    improvement_threshold=0.1
)

for breakthrough in breakthroughs:
    print(f"Generation {breakthrough.generation}:")
    print(f"  Mutation: {breakthrough.mutation}")
    print(f"  Improvement: {breakthrough.improvement:.2%}")
from meta_prompt_evolution.analysis import PatternMiner

miner = PatternMiner()

# Find successful prompt patterns
patterns = miner.extract_patterns(
    successful_prompts=top_1000_prompts,
    min_support=0.1,
    min_confidence=0.8
)

# Common elements in high-performing prompts
common_elements = miner.find_common_elements(
    prompts=high_accuracy_prompts,
    element_types=["instructions", "formatting", "examples"]
)

# Cluster prompts by strategy
clusters = miner.cluster_by_strategy(
    prompts=all_prompts,
    n_clusters=10,
    features=["syntax", "semantics", "structure"]
)
from meta_prompt_evolution.deployment import ContinuousDeployment

cd_pipeline = ContinuousDeployment(
    staging_env="https://staging.api.com",
    production_env="https://api.com",
    rollout_strategy="canary"
)

# Automated deployment pipeline
@cd_pipeline.on_new_champion
async def deploy_new_champion(champion_prompt):
    # Run safety checks
    safety_passed = await cd_pipeline.safety_checks(
        prompt=champion_prompt,
        checks=["toxicity", "hallucination", "instruction_following"]
    )

    if not safety_passed:
        return False

    # Canary deployment
    await cd_pipeline.canary_deploy(
        prompt=champion_prompt,
        initial_traffic=0.01,
        ramp_up_hours=24,
        success_criteria={
            "error_rate": {"max": 0.001},
            "p95_latency": {"max": 500},
            "user_satisfaction": {"min": 4.5}
        }
    )
    return True
from meta_prompt_evolution.monitoring import PromptMonitor

monitor = PromptMonitor(
    metrics_backend="prometheus",
    alerting_backend="pagerduty"
)

# Set up monitoring
monitor.track_metrics({
    "prompt_performance": ["accuracy", "f1_score", "latency"],
    "user_metrics": ["satisfaction", "task_completion", "retry_rate"],
    "system_metrics": ["token_usage", "cost", "error_rate"]
})

# Define alerts
monitor.add_alert(
    name="performance_degradation",
    condition="accuracy < 0.85 for 5 minutes",
    severity="high",
    action="rollback_to_previous"
)

monitor.add_alert(
    name="cost_spike",
    condition="hourly_cost > 100",
    severity="medium",
    action="switch_to_efficient_variant"
)
from meta_prompt_evolution.web import Dashboard

# Launch web dashboard
dashboard = Dashboard(
    hub=hub,
    port=8080,
    auth="oauth2"
)

# Custom visualizations
dashboard.add_visualization(
    "evolution_progress",
    type="real_time_chart",
    metrics=["best_fitness", "population_diversity", "mutation_rate"]
)

dashboard.add_visualization(
    "prompt_explorer",
    type="interactive_graph",
    data="prompt_genealogy"
)

dashboard.launch()
from meta_prompt_evolution.diversity import DiversityMaintainer

# Maintain healthy diversity
diversity_maintainer = DiversityMaintainer(
    min_diversity=0.3,
    diversity_metric="embedding_distance"
)

# Inject diversity when needed
@hub.on_generation_complete
def maintain_diversity(generation):
    diversity = diversity_maintainer.calculate_diversity(generation)

    if diversity < diversity_maintainer.min_diversity:
        # Introduce novel prompts
        novel_prompts = diversity_maintainer.generate_novel_prompts(
            num_prompts=50,
            methods=["random_generation", "crossover_with_archive", "llm_generation"]
        )
        generation.inject_prompts(novel_prompts)
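For reference, an "embedding_distance" style diversity score is typically something like the mean pairwise cosine distance between prompt embeddings; a rough sketch of that metric (illustrative, not the library's internal definition):

import numpy as np
from sentence_transformers import SentenceTransformer

def embedding_diversity(prompt_texts, model_name="all-mpnet-base-v2"):
    # Mean pairwise cosine distance across the population's prompt embeddings.
    embeddings = SentenceTransformer(model_name).encode(prompt_texts)
    normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    cosine_sim = normed @ normed.T
    off_diagonal = cosine_sim[~np.eye(len(prompt_texts), dtype=bool)]
    return float(np.mean(1.0 - off_diagonal))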
from meta_prompt_evolution.evaluation import EfficientEvaluator

# Reduce evaluation costs
efficient_eval = EfficientEvaluator(
    surrogate_model="distilgpt2",  # Fast approximation
    full_model="gpt-4",            # Accurate but expensive
    surrogate_confidence_threshold=0.9
)

# Adaptive evaluation
def adaptive_fitness(prompt):
    # Quick surrogate evaluation
    surrogate_score, confidence = efficient_eval.surrogate_eval(prompt)

    # Full evaluation only when uncertain
    if confidence < efficient_eval.surrogate_confidence_threshold:
        return efficient_eval.full_eval(prompt)
    return surrogate_score
We welcome contributions! See CONTRIBUTING.md for guidelines.
@software{meta_prompt_evolution_hub,
  title={Meta-Prompt-Evolution-Hub: Evolutionary Prompt Optimization at Scale},
  author={Daniel Schmidt},
  year={2025},
  url={https://github.com/danieleschmidt/meta-prompt-evolution-hub}
}
MIT License - see LICENSE for details.
- Evolutionary computation community
- LLM evaluation frameworks
- Open-source contributors