| 1 | +""" |
| 2 | +Dynamic example generator for missing language pairs. |
| 3 | + |
| 4 | +DEPRECATED: This module is superseded by prompts/examples/technical_generator.py |
| 5 | +which provides a unified generator for both placeholder and image examples. |
| 6 | +This file is kept for backwards compatibility with existing cache files. |
| 7 | + |
| 8 | +This module generates placeholder preservation examples on-demand using the |
| 9 | +configured LLM provider, with persistent file-based caching. |
| 10 | +""" |
| 11 | + |
| 12 | +import asyncio |
| 13 | +import json |
| 14 | +import os |
| 15 | +from pathlib import Path |
| 16 | +from typing import Dict, Optional, Tuple, Any |
| 17 | + |
| 18 | +from src.config import create_placeholder |
| 19 | + |
| 20 | +# Generate placeholders using actual config |
| 21 | +TAG0 = create_placeholder(0) |
| 22 | +TAG1 = create_placeholder(1) |
| 23 | + |
| 24 | +# Cache file location (same directory as this module) |
| 25 | +CACHE_FILE = Path(__file__).parent / "examples_cache.json" |
| 26 | + |
| 27 | +# Source sentence template (English) - simple and universal |
| 28 | +SOURCE_TEMPLATE = f"This is {TAG0}very important{TAG1} information" |
| 29 | + |
| 30 | + |
| 31 | +def _load_cache() -> Dict[str, Dict[str, str]]: |
| 32 | + """Load cached examples from file.""" |
| 33 | + if CACHE_FILE.exists(): |
| 34 | + try: |
| 35 | + with open(CACHE_FILE, "r", encoding="utf-8") as f: |
| 36 | + return json.load(f) |
| 37 | + except (json.JSONDecodeError, IOError): |
| 38 | + return {} |
| 39 | + return {} |
| 40 | + |
| 41 | + |
| 42 | +def _save_cache(cache: Dict[str, Dict[str, str]]) -> None: |
| 43 | + """Save examples cache to file.""" |
| 44 | + try: |
| 45 | + with open(CACHE_FILE, "w", encoding="utf-8") as f: |
| 46 | + json.dump(cache, f, ensure_ascii=False, indent=2) |
| 47 | + except IOError as e: |
| 48 | + print(f"[WARNING] Could not save examples cache: {e}") |
| 49 | + |
| 50 | + |
| 51 | +def _get_cache_key(source_lang: str, target_lang: str) -> str: |
| 52 | + """Generate cache key for a language pair.""" |
| 53 | + return f"{source_lang.lower()}:{target_lang.lower()}" |
| 54 | + |
| 55 | + |
| 56 | +def get_cached_example( |
| 57 | + source_lang: str, |
| 58 | + target_lang: str |
| 59 | +) -> Optional[Dict[str, str]]: |
| 60 | + """ |
| 61 | + Get a cached example for a language pair. |
| 62 | + |
| 63 | + Returns: |
| 64 | + Dict with "source", "correct", "wrong" or None if not cached. |
| 65 | + """ |
| 66 | + cache = _load_cache() |
| 67 | + key = _get_cache_key(source_lang, target_lang) |
| 68 | + return cache.get(key) |
| 69 | + |
| 70 | + |
| 71 | +def save_generated_example( |
| 72 | + source_lang: str, |
| 73 | + target_lang: str, |
| 74 | + example: Dict[str, str] |
| 75 | +) -> None: |
| 76 | + """Save a generated example to the cache.""" |
| 77 | + cache = _load_cache() |
| 78 | + key = _get_cache_key(source_lang, target_lang) |
| 79 | + cache[key] = example |
| 80 | + _save_cache(cache) |
| 81 | + |
| 82 | + |
| 83 | +def build_generation_prompt(source_lang: str, target_lang: str) -> str: |
| 84 | + """ |
| 85 | + Build a simple prompt to generate a translation example. |
| 86 | + |
| 87 | + The prompt is designed to be simple enough for any LLM to handle correctly. |
| 88 | + """ |
| 89 | + return f"""Translate this sentence from {source_lang} to {target_lang}. |
| 90 | + |
| 91 | +CRITICAL: Keep {TAG0} and {TAG1} EXACTLY as they are. Do NOT remove or modify them. |
| 92 | + |
| 93 | +Sentence: {SOURCE_TEMPLATE} |
| 94 | + |
| 95 | +Reply with ONLY the translated sentence, nothing else.""" |
| 96 | + |
| 97 | + |
| 98 | +async def generate_example_async( |
| 99 | + source_lang: str, |
| 100 | + target_lang: str, |
| 101 | + provider: Any # LLMProvider instance |
| 102 | +) -> Optional[Dict[str, str]]: |
| 103 | + """ |
| 104 | + Generate a placeholder example using the LLM provider. |
| 105 | + |
| 106 | + Args: |
| 107 | + source_lang: Source language name |
| 108 | + target_lang: Target language name |
| 109 | + provider: An LLMProvider instance (OllamaProvider, GeminiProvider, etc.) |
| 110 | + |
| 111 | + Returns: |
| 112 | + Dict with "source", "correct", "wrong" examples, or None if generation failed. |
| 113 | + """ |
| 114 | + prompt = build_generation_prompt(source_lang, target_lang) |
| 115 | + |
| 116 | + try: |
| 117 | + # Use a short timeout for this simple task |
| 118 | + response = await provider.generate(prompt, timeout=30) |
| 119 | + |
| 120 | + if not response: |
| 121 | + return None |
| 122 | + |
| 123 | + # Clean up the response |
| 124 | + translated = response.strip() |
| 125 | + |
| 126 | + # Remove any quotes if the LLM wrapped the response |
| 127 | + if translated.startswith('"') and translated.endswith('"'): |
| 128 | + translated = translated[1:-1] |
| 129 | + if translated.startswith("'") and translated.endswith("'"): |
| 130 | + translated = translated[1:-1] |
| 131 | + |
| 132 | + # Validate that placeholders are preserved |
| 133 | + if TAG0 not in translated or TAG1 not in translated: |
| 134 | + print(f"[WARNING] LLM did not preserve placeholders for {source_lang}->{target_lang}") |
| 135 | + return None |
| 136 | + |
| 137 | + # Create the "wrong" example by removing placeholders |
| 138 | + wrong = translated.replace(TAG0, "").replace(TAG1, "") |
| 139 | + # Clean up any double spaces |
| 140 | + wrong = " ".join(wrong.split()) |
| 141 | + |
| 142 | + example = { |
| 143 | + "source": SOURCE_TEMPLATE, |
| 144 | + "correct": translated, |
| 145 | + "wrong": wrong |
| 146 | + } |
| 147 | + |
| 148 | + # Cache the result |
| 149 | + save_generated_example(source_lang, target_lang, example) |
| 150 | + |
| 151 | + return example |
| 152 | + |
| 153 | + except Exception as e: |
| 154 | + print(f"[WARNING] Failed to generate example for {source_lang}->{target_lang}: {e}") |
| 155 | + return None |
| 156 | + |
| 157 | + |
| 158 | +def generate_example_sync( |
| 159 | + source_lang: str, |
| 160 | + target_lang: str, |
| 161 | + provider: Any |
| 162 | +) -> Optional[Dict[str, str]]: |
| 163 | + """ |
| 164 | + Synchronous wrapper for generate_example_async. |
| 165 | + |
| 166 | + Use this when calling from synchronous code. |
| 167 | + """ |
| 168 | + try: |
| 169 | + loop = asyncio.get_event_loop() |
| 170 | + if loop.is_running(): |
| 171 | +            # Already inside a running event loop: run the coroutine in a worker thread |
| 172 | + import concurrent.futures |
| 173 | + with concurrent.futures.ThreadPoolExecutor() as executor: |
| 174 | + future = executor.submit( |
| 175 | + asyncio.run, |
| 176 | + generate_example_async(source_lang, target_lang, provider) |
| 177 | + ) |
| 178 | + return future.result(timeout=60) |
| 179 | + else: |
| 180 | + return loop.run_until_complete( |
| 181 | + generate_example_async(source_lang, target_lang, provider) |
| 182 | + ) |
| 183 | + except Exception as e: |
| 184 | + print(f"[WARNING] Sync generation failed for {source_lang}->{target_lang}: {e}") |
| 185 | + return None |
| 186 | + |
| 187 | + |
| 188 | +async def ensure_example_exists( |
| 189 | + source_lang: str, |
| 190 | + target_lang: str, |
| 191 | + provider: Any, |
| 192 | + static_examples: Dict[Tuple[str, str], Dict[str, str]] |
| 193 | +) -> Tuple[Dict[str, str], str, str]: |
| 194 | + """ |
| 195 | + Ensure an example exists for the language pair. |
| 196 | + |
| 197 | + Checks in order: |
| 198 | + 1. Static examples (from examples.py) |
| 199 | + 2. Cached generated examples |
| 200 | + 3. Generate new example with LLM |
| 201 | +    4. Fall back to a static English-based pair, ultimately English->Chinese |
| 202 | + |
| 203 | + Args: |
| 204 | + source_lang: Source language name |
| 205 | + target_lang: Target language name |
| 206 | + provider: LLMProvider instance |
| 207 | + static_examples: PLACEHOLDER_EXAMPLES dict from examples.py |
| 208 | + |
| 209 | + Returns: |
| 210 | + Tuple of (example_dict, actual_source_lang, actual_target_lang) |
| 211 | + """ |
| 212 | + key = (source_lang.lower(), target_lang.lower()) |
| 213 | + |
| 214 | + # 1. Check static examples |
| 215 | + if key in static_examples: |
| 216 | + return static_examples[key], source_lang, target_lang |
| 217 | + |
| 218 | + # 2. Check cache |
| 219 | + cached = get_cached_example(source_lang, target_lang) |
| 220 | + if cached: |
| 221 | + return cached, source_lang, target_lang |
| 222 | + |
| 223 | + # 3. Try to generate with LLM |
| 224 | + if provider: |
| 225 | + generated = await generate_example_async(source_lang, target_lang, provider) |
| 226 | + if generated: |
| 227 | + print(f"[INFO] Generated placeholder example for {source_lang}->{target_lang}") |
| 228 | + return generated, source_lang, target_lang |
| 229 | + |
| 230 | + # 4. Fallback chain: try English as source, then source to English |
| 231 | + fallback_key = ("english", target_lang.lower()) |
| 232 | + if fallback_key in static_examples: |
| 233 | + return static_examples[fallback_key], "English", target_lang |
| 234 | + |
| 235 | + fallback_key = (source_lang.lower(), "english") |
| 236 | + if fallback_key in static_examples: |
| 237 | + return static_examples[fallback_key], source_lang, "English" |
| 238 | + |
| 239 | + # 5. Ultimate fallback |
| 240 | + return static_examples[("english", "chinese")], "English", "Chinese" |
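
For context, here is a minimal sketch of how this generator could be exercised end to end. The import path and `DummyProvider` below are illustrative assumptions, not part of this commit; any object exposing an awaitable `generate(prompt, timeout=...)` (such as the project's OllamaProvider or GeminiProvider) would work the same way.

```python
# Hypothetical usage sketch -- the import path and DummyProvider are assumptions,
# not part of this commit.
import asyncio

from prompts.examples import example_generator as gen  # adjust to the real module path


class DummyProvider:
    """Stand-in for an LLMProvider exposing an async generate(prompt, timeout=...)."""

    async def generate(self, prompt: str, timeout: int = 30) -> str:
        # A real provider would call the configured LLM; this canned reply simply
        # keeps both placeholders intact so validation and caching succeed.
        return f"Esto es información {gen.TAG0}muy importante{gen.TAG1}"


async def main() -> None:
    provider = DummyProvider()
    # An empty static_examples dict forces the cache -> generate -> fallback path.
    example, src, tgt = await gen.ensure_example_exists(
        "English", "Spanish", provider, static_examples={}
    )
    print(f"{src} -> {tgt}: {example}")
    # The generated example is now persisted in examples_cache.json for future calls.
    assert gen.get_cached_example("English", "Spanish") == example


if __name__ == "__main__":
    asyncio.run(main())
```

Synchronous callers can use `generate_example_sync` instead, which wraps the same coroutine and falls back to running it in a worker thread when an event loop is already active.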