
Commit e5e7ee4

Merge pull request #90 from hydropix/dev
fix other languages field
2 parents 173619d + 215b1e7 commit e5e7ee4

File tree: 18 files changed (+94218, -169 lines)


benchmark/cli.py

Lines changed: 7 additions & 10 deletions

@@ -454,19 +454,16 @@ def cmd_wiki_publish(args: argparse.Namespace) -> int:
     # Step 3: Copy generated files to wiki repo
     print(colored("Step 3/4: Copying files to wiki repository...", Colors.CYAN))

-    # Copy all markdown files
+    # Remove old subdirectories (now using flat structure)
+    for old_subdir in ["languages", "models"]:
+        old_dir = wiki_clone_dir / old_subdir
+        if old_dir.exists():
+            shutil.rmtree(old_dir)
+
+    # Copy all markdown files (flat structure)
     for md_file in wiki_output_dir.glob("*.md"):
         shutil.copy2(md_file, wiki_clone_dir / md_file.name)

-    # Copy subdirectories (languages, models)
-    for subdir in ["languages", "models"]:
-        src_dir = wiki_output_dir / subdir
-        dst_dir = wiki_clone_dir / subdir
-        if src_dir.exists():
-            if dst_dir.exists():
-                shutil.rmtree(dst_dir)
-            shutil.copytree(src_dir, dst_dir)
-
     print(colored("Files copied.", Colors.GREEN))

     # Step 4: Commit and push
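
The publish step now mirrors the generator's flat layout: any languages/ and models/ subdirectories left in the wiki clone by earlier runs are removed before the flat set of *.md pages is copied in. A minimal standalone sketch of that sync step (the two directory paths are placeholders, not the CLI's actual arguments):

import shutil
from pathlib import Path

def copy_wiki_pages(wiki_output_dir: Path, wiki_clone_dir: Path) -> int:
    """Sync generated wiki pages into a wiki clone using a flat layout (sketch only)."""
    # Drop subdirectories from the old nested layout, if they are still present.
    for old_subdir in ("languages", "models"):
        old_dir = wiki_clone_dir / old_subdir
        if old_dir.exists():
            shutil.rmtree(old_dir)

    # Copy every generated markdown page directly into the wiki root.
    copied = 0
    for md_file in wiki_output_dir.glob("*.md"):
        shutil.copy2(md_file, wiki_clone_dir / md_file.name)
        copied += 1
    return copied

if __name__ == "__main__":
    # Hypothetical paths for illustration.
    print(copy_wiki_pages(Path("wiki_output"), Path("wiki_clone")), "pages copied")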

benchmark/translator.py

Lines changed: 23 additions & 16 deletions

@@ -101,22 +101,27 @@ def _build_prompt(
     """
     system_prompt = f"""You are a professional {target_language} translator and writer.

-# CRITICAL: TARGET LANGUAGE IS {target_language.upper()}
+# TRANSLATION PRINCIPLES

-**YOUR TRANSLATION MUST BE WRITTEN ENTIRELY IN {target_language.upper()}.**
+Translate {source_language} to {target_language}. Output only the translation.

-You are translating FROM {source_language} TO {target_language}.
-Your output must be in {target_language} ONLY - do NOT use any other language.
+**PRIORITY ORDER:**
+1. Preserve exact names
+2. Match original tone and formality
+3. Use natural {target_language} phrasing - never word-for-word
+4. Fix grammar/spelling errors in output
+5. Translate idioms to {target_language} equivalents
+6. Preserve the author's literary style and emotional impact

-# TRANSLATION PRINCIPLES
+**QUALITY CHECK:**
+- Does it sound natural to a native {target_language} speaker?
+- Are all details from the original included?
+- Does punctuation follow {target_language} conventions?

-**Quality Standards:**
-- Translate faithfully while preserving the author's literary style, tone, and voice
-- Maintain the original meaning
-- Restructure sentences naturally in {target_language} (avoid word-by-word translation)
-- Adapt cultural references, idioms, and expressions to {target_language} context
-- Keep period-appropriate language when translating historical or classical texts
-- Preserve the emotional impact and atmosphere of the original
+If unsure between literal and natural phrasing: **choose natural**.
+
+**LAYOUT PRESERVATION:**
+- Keep the exact text layout, spacing, line breaks, and indentation
 - **WRITE YOUR TRANSLATION IN {target_language.upper()} - THIS IS MANDATORY**

 # FINAL REMINDER: YOUR OUTPUT LANGUAGE

@@ -134,12 +139,14 @@ def _build_prompt(
 4. Do NOT add explanations, comments, notes, or greetings

 **INCORRECT examples (DO NOT do this):**
-- "Here is the translation: {TRANSLATE_TAG_IN}Text...{TRANSLATE_TAG_OUT}"
-- "{TRANSLATE_TAG_IN}Text...{TRANSLATE_TAG_OUT} (Additional comment)"
-- "Sure! {TRANSLATE_TAG_IN}Text...{TRANSLATE_TAG_OUT}"
+❌ "Here is the translation: {TRANSLATE_TAG_IN}Text...{TRANSLATE_TAG_OUT}"
+❌ "{TRANSLATE_TAG_IN}Text...{TRANSLATE_TAG_OUT} (Additional comment)"
+❌ "Sure! {TRANSLATE_TAG_IN}Text...{TRANSLATE_TAG_OUT}"
+❌ "Text..." (missing tags entirely)
+❌ "{TRANSLATE_TAG_IN}Text..." (missing closing tag)

 **CORRECT format (ONLY this):**
-{TRANSLATE_TAG_IN}
+{TRANSLATE_TAG_IN}
 Your translated text here
 {TRANSLATE_TAG_OUT}"""
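
The output contract is unchanged: the model must wrap its translation in the TRANSLATE_TAG_IN / TRANSLATE_TAG_OUT markers, and the new ❌ examples simply spell out more failure modes (missing or unclosed tags). A sketch of how a caller might enforce that contract; the helper and the tag values below are illustrative, not the benchmark's actual constants:

import re

# Illustrative tag values; the real constants are defined in benchmark/translator.py.
TRANSLATE_TAG_IN = "[TRANSLATION]"
TRANSLATE_TAG_OUT = "[/TRANSLATION]"

def extract_translation(raw_output: str) -> str:
    """Return the text between the tags, or raise if the output contract was broken."""
    pattern = re.escape(TRANSLATE_TAG_IN) + r"(.*?)" + re.escape(TRANSLATE_TAG_OUT)
    match = re.search(pattern, raw_output, flags=re.DOTALL)
    if match is None:
        raise ValueError("Model output is missing the required translation tags")
    return match.group(1).strip()

# A compliant response is reduced to just the translated text.
print(extract_translation("[TRANSLATION]\nBonjour le monde\n[/TRANSLATION]"))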

benchmark/wiki/generator.py

Lines changed: 30 additions & 20 deletions

@@ -152,13 +152,21 @@ def _get_language_info(self, code: str) -> dict:
         }

     def _slugify(self, text: str) -> str:
-        """Convert text to URL-safe slug."""
+        """Convert text to URL-safe slug for GitHub wiki page names."""
         slug = text.lower()
         slug = re.sub(r"[^a-z0-9\-_]", "-", slug)
         slug = re.sub(r"-+", "-", slug)
         slug = slug.strip("-")
         return slug

+    def _language_page_name(self, language_name: str) -> str:
+        """Generate wiki page name for a language (flat structure for GitHub wiki)."""
+        return f"Language-{self._slugify(language_name)}"
+
+    def _model_page_name(self, model_name: str) -> str:
+        """Generate wiki page name for a model (flat structure for GitHub wiki)."""
+        return f"Model-{self._slugify(model_name)}"
+
     def _calculate_score_distribution(self, scores: list[float]) -> dict:
         """Calculate score distribution buckets."""
         dist = {"excellent": 0, "good": 0, "acceptable": 0, "poor": 0, "failed": 0}
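
The two new helpers turn a display name into a flat wiki page name by prefixing its slug, so every generated page can live at the wiki root instead of under languages/ or models/. A standalone sketch of the same idea (the example inputs are made up):

import re

def slugify(text: str) -> str:
    """Lowercase, then collapse anything outside [a-z0-9-_] into single hyphens."""
    slug = text.lower()
    slug = re.sub(r"[^a-z0-9\-_]", "-", slug)
    slug = re.sub(r"-+", "-", slug)
    return slug.strip("-")

def language_page_name(language_name: str) -> str:
    return f"Language-{slugify(language_name)}"

def model_page_name(model_name: str) -> str:
    return f"Model-{slugify(model_name)}"

# Hypothetical examples of the flat page names this scheme produces:
print(language_page_name("Brazilian Portuguese"))   # Language-brazilian-portuguese
print(model_page_name("mistral-small3.2:24b"))      # Model-mistral-small3-2-24b
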
@@ -194,10 +202,8 @@ def generate_all(self, run_id: Optional[str] = None) -> Path:
         if run is None:
             raise ValueError("No benchmark run found")

-        # Ensure output directories exist
+        # Ensure output directory exists (flat structure for GitHub wiki)
         self.output_dir.mkdir(parents=True, exist_ok=True)
-        (self.output_dir / "languages").mkdir(exist_ok=True)
-        (self.output_dir / "models").mkdir(exist_ok=True)

         # Generate all pages
         self._generate_home(run)

@@ -218,7 +224,7 @@ def _generate_home(self, run: BenchmarkRun) -> None:
         for stats in sorted(model_stats, key=lambda x: x.avg_overall, reverse=True):
             model_rankings.append({
                 "name": stats.model,
-                "slug": self._slugify(stats.model),
+                "page_name": self._model_page_name(stats.model),
                 "avg_overall": stats.avg_overall,
                 "avg_accuracy": stats.avg_accuracy,
                 "avg_fluency": stats.avg_fluency,

@@ -235,7 +241,7 @@ def _generate_home(self, run: BenchmarkRun) -> None:
             language_rankings.append({
                 "name": lang_info["name"],
                 "native_name": lang_info["native_name"],
-                "slug": self._slugify(lang_info["name"]),
+                "page_name": self._language_page_name(lang_info["name"]),
                 "avg_overall": stats.avg_overall,
                 "indicator": get_score_indicator(stats.avg_overall),
                 "best_model": stats.best_model or "N/A",

@@ -269,16 +275,17 @@ def _generate_all_languages_page(self, run: BenchmarkRun) -> None:
         for stats in sorted(language_stats, key=lambda x: x.avg_overall, reverse=True):
             lang_info = self._get_language_info(stats.language_code)
             indicator = get_score_indicator(stats.avg_overall)
+            page_name = self._language_page_name(lang_info['name'])
             rows.append([
-                f"[{lang_info['name']}](languages/{self._slugify(lang_info['name'])})",
+                f"[{lang_info['name']}]({page_name})",
                 lang_info['native_name'],
                 lang_info['category'],
                 f"{indicator} {stats.avg_overall:.1f}",
                 stats.best_model or "N/A",
             ])

         table = format_markdown_table(headers, rows)
-        content = f"# All Languages\n\n{table}\n\n---\n\n[< Back to Home](Home)\n"
+        content = f"# All Languages\n\n{table}\n\n---\n\n[ Back to Home](Home)\n"

         (self.output_dir / "All-Languages.md").write_text(content, encoding="utf-8")

@@ -291,8 +298,9 @@ def _generate_all_models_page(self, run: BenchmarkRun) -> None:

         for stats in sorted(model_stats, key=lambda x: x.avg_overall, reverse=True):
             indicator = get_score_indicator(stats.avg_overall)
+            page_name = self._model_page_name(stats.model)
             rows.append([
-                f"[{stats.model}](models/{self._slugify(stats.model)})",
+                f"[{stats.model}]({page_name})",
                 f"{indicator} {stats.avg_overall:.1f}",
                 f"{stats.avg_accuracy:.1f}",
                 f"{stats.avg_fluency:.1f}",

@@ -301,7 +309,7 @@ def _generate_all_models_page(self, run: BenchmarkRun) -> None:
             ])

         table = format_markdown_table(headers, rows)
-        content = f"# All Models\n\n{table}\n\n---\n\n[< Back to Home](Home)\n"
+        content = f"# All Models\n\n{table}\n\n---\n\n[ Back to Home](Home)\n"

         (self.output_dir / "All-Models.md").write_text(content, encoding="utf-8")

@@ -340,7 +348,7 @@ def _generate_language_pages(self, run: BenchmarkRun) -> None:
             if m_scores:
                 model_results.append({
                     "model": model,
-                    "model_slug": self._slugify(model),
+                    "model_page_name": self._model_page_name(model),
                     "avg_overall": sum(s.overall for s in m_scores) / len(m_scores),
                     "avg_accuracy": sum(s.accuracy for s in m_scores) / len(m_scores),
                     "avg_fluency": sum(s.fluency for s in m_scores) / len(m_scores),

@@ -367,16 +375,17 @@ def _generate_language_pages(self, run: BenchmarkRun) -> None:
                 total_translations=len(results),
                 model_results=model_results,
                 best_model=best_model,
-                best_model_slug=self._slugify(best_model),
+                best_model_page_name=self._model_page_name(best_model),
                 worst_model=worst_model,
-                worst_model_slug=self._slugify(worst_model),
+                worst_model_page_name=self._model_page_name(worst_model),
                 examples=examples,
                 score_dist=self._calculate_score_distribution(scores),
                 indicators=self.INDICATORS,
             )

-            filename = f"{self._slugify(lang_info['name'])}.md"
-            (self.output_dir / "languages" / filename).write_text(content, encoding="utf-8")
+            # Write to flat directory structure (GitHub wiki doesn't support subdirectories)
+            filename = f"{self._language_page_name(lang_info['name'])}.md"
+            (self.output_dir / filename).write_text(content, encoding="utf-8")

     def _generate_model_pages(self, run: BenchmarkRun) -> None:
         """Generate individual model pages."""

@@ -414,7 +423,7 @@ def _generate_model_pages(self, run: BenchmarkRun) -> None:
                 language_results.append({
                     "code": lang_code,
                     "name": lang_info["name"],
-                    "slug": self._slugify(lang_info["name"]),
+                    "page_name": self._language_page_name(lang_info["name"]),
                     "category": lang_info["category"],
                     "avg_overall": lang_avg,
                     "avg_accuracy": sum(s.accuracy for s in l_scores) / len(l_scores),

@@ -453,10 +462,10 @@ def _generate_model_pages(self, run: BenchmarkRun) -> None:
                 language_results=language_results,
                 categories=categories,
                 best_language=best_lang["name"] if best_lang else "N/A",
-                best_language_slug=self._slugify(best_lang["name"]) if best_lang else "",
+                best_language_page_name=self._language_page_name(best_lang["name"]) if best_lang else "",
                 best_language_score=best_lang["avg_overall"] if best_lang else 0,
                 worst_language=worst_lang["name"] if worst_lang else "N/A",
-                worst_language_slug=self._slugify(worst_lang["name"]) if worst_lang else "",
+                worst_language_page_name=self._language_page_name(worst_lang["name"]) if worst_lang else "",
                 worst_language_score=worst_lang["avg_overall"] if worst_lang else 0,
                 best_example=best_example,
                 worst_example=worst_example,

@@ -465,8 +474,9 @@ def _generate_model_pages(self, run: BenchmarkRun) -> None:
                 avg_translation_time_ms=avg_translation_time,
             )

-            filename = f"{self._slugify(model_name)}.md"
-            (self.output_dir / "models" / filename).write_text(content, encoding="utf-8")
+            # Write to flat directory structure (GitHub wiki doesn't support subdirectories)
+            filename = f"{self._model_page_name(model_name)}.md"
+            (self.output_dir / filename).write_text(content, encoding="utf-8")

     def _group_languages_by_category(self, language_rankings: list[dict]) -> list[dict]:
         """Group language rankings by category."""

benchmark/wiki/templates/home.md.j2

Lines changed: 3 additions & 3 deletions

@@ -23,7 +23,7 @@ Overall performance across all tested languages:
 | Rank | Model | Avg Score | Accuracy | Fluency | Style | Languages Tested |
 |------|-------|-----------|----------|---------|-------|------------------|
 {% for model in model_rankings %}
-| {{ loop.index }} | [{{ model.name }}](models/{{ model.slug }}) | {{ model.indicator }} {{ model.avg_overall|round(1) }} | {{ model.avg_accuracy|round(1) }} | {{ model.avg_fluency|round(1) }} | {{ model.avg_style|round(1) }} | {{ model.languages_tested }} |
+| {{ loop.index }} | [{{ model.name }}]({{ model.page_name }}) | {{ model.indicator }} {{ model.avg_overall|round(1) }} | {{ model.avg_accuracy|round(1) }} | {{ model.avg_fluency|round(1) }} | {{ model.avg_style|round(1) }} | {{ model.languages_tested }} |
 {% endfor %}

 ---

@@ -35,7 +35,7 @@ Best translation quality by target language:
 | Rank | Language | Native | Avg Score | Best Model | Tests |
 |------|----------|--------|-----------|------------|-------|
 {% for lang in language_rankings[:15] %}
-| {{ loop.index }} | [{{ lang.name }}](languages/{{ lang.slug }}) | {{ lang.native_name }} | {{ lang.indicator }} {{ lang.avg_overall|round(1) }} | {{ lang.best_model }} | {{ lang.total_translations }} |
+| {{ loop.index }} | [{{ lang.name }}]({{ lang.page_name }}) | {{ lang.native_name }} | {{ lang.indicator }} {{ lang.avg_overall|round(1) }} | {{ lang.best_model }} | {{ lang.total_translations }} |
 {% endfor %}

 {% if language_rankings|length > 15 %}

@@ -64,7 +64,7 @@ Best translation quality by target language:
 | Language | Avg Score | Best Model |
 |----------|-----------|------------|
 {% for lang in category.languages %}
-| [{{ lang.name }}](languages/{{ lang.slug }}) | {{ lang.indicator }} {{ lang.avg_overall|round(1) }} | {{ lang.best_model }} |
+| [{{ lang.name }}]({{ lang.page_name }}) | {{ lang.indicator }} {{ lang.avg_overall|round(1) }} | {{ lang.best_model }} |
 {% endfor %}

 {% endfor %}
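
The templates now receive the prebuilt page_name instead of a slug, so a ranking row renders a root-level wiki link such as [gpt-4o](Model-gpt-4o) rather than models/gpt-4o; the language and model templates below apply the same pattern. A quick way to preview a rendered row with jinja2 (the ranking entry is made up, and a plain variable stands in for loop.index, which only exists inside a {% for %} block):

from jinja2 import Template

row = Template(
    "| {{ rank }} | [{{ model.name }}]({{ model.page_name }}) | "
    "{{ model.avg_overall|round(1) }} |"
)

# Hypothetical ranking entry, shaped like the dicts built in benchmark/wiki/generator.py.
entry = {"name": "gpt-4o", "page_name": "Model-gpt-4o", "avg_overall": 8.74}
print(row.render(rank=1, model=entry))
# | 1 | [gpt-4o](Model-gpt-4o) | 8.7 |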

benchmark/wiki/templates/language.md.j2

Lines changed: 4 additions & 4 deletions

@@ -15,8 +15,8 @@
 | **Fluency** | {{ avg_fluency|round(1) }}/10 |
 | **Style** | {{ avg_style|round(1) }}/10 |
 | **Total Tests** | {{ total_translations }} |
-| **Best Model** | [{{ best_model }}](../models/{{ best_model_slug }}) |
-| **Worst Model** | [{{ worst_model }}](../models/{{ worst_model_slug }}) |
+| **Best Model** | [{{ best_model }}]({{ best_model_page_name }}) |
+| **Worst Model** | [{{ worst_model }}]({{ worst_model_page_name }}) |

 ---

@@ -25,7 +25,7 @@
 | Model | Overall | Accuracy | Fluency | Style |
 |-------|---------|----------|---------|-------|
 {% for result in model_results %}
-| [{{ result.model }}](../models/{{ result.model_slug }}) | {{ result.indicator }} {{ result.avg_overall|round(1) }} | {{ result.avg_accuracy|round(1) }} | {{ result.avg_fluency|round(1) }} | {{ result.avg_style|round(1) }} |
+| [{{ result.model }}]({{ result.model_page_name }}) | {{ result.indicator }} {{ result.avg_overall|round(1) }} | {{ result.avg_accuracy|round(1) }} | {{ result.avg_fluency|round(1) }} | {{ result.avg_style|round(1) }} |
 {% endfor %}

 ---

@@ -70,4 +70,4 @@

 ---

-[< Back to Home](../Home) | [All Languages](../All-Languages)
+[ Back to Home](Home) | [All Languages](All-Languages)

benchmark/wiki/templates/model.md.j2

Lines changed: 6 additions & 6 deletions

@@ -14,8 +14,8 @@
 | **Style** | {{ avg_style|round(1) }}/10 |
 | **Languages Tested** | {{ total_languages }} |
 | **Total Translations** | {{ total_translations }} |
-| **Best Language** | [{{ best_language }}](../languages/{{ best_language_slug }}) ({{ best_language_score|round(1) }}) |
-| **Worst Language** | [{{ worst_language }}](../languages/{{ worst_language_slug }}) ({{ worst_language_score|round(1) }}) |
+| **Best Language** | [{{ best_language }}]({{ best_language_page_name }}) ({{ best_language_score|round(1) }}) |
+| **Worst Language** | [{{ worst_language }}]({{ worst_language_page_name }}) ({{ worst_language_score|round(1) }}) |

 ---

@@ -26,7 +26,7 @@
 | Rank | Language | Overall | Accuracy | Fluency | Style |
 |------|----------|---------|----------|---------|-------|
 {% for lang in language_results[:10] %}
-| {{ loop.index }} | [{{ lang.name }}](../languages/{{ lang.slug }}) | {{ lang.indicator }} {{ lang.avg_overall|round(1) }} | {{ lang.avg_accuracy|round(1) }} | {{ lang.avg_fluency|round(1) }} | {{ lang.avg_style|round(1) }} |
+| {{ loop.index }} | [{{ lang.name }}]({{ lang.page_name }}) | {{ lang.indicator }} {{ lang.avg_overall|round(1) }} | {{ lang.avg_accuracy|round(1) }} | {{ lang.avg_fluency|round(1) }} | {{ lang.avg_style|round(1) }} |
 {% endfor %}

 {% if language_results|length > 10 %}

@@ -36,7 +36,7 @@
 | Rank | Language | Overall | Accuracy | Fluency | Style |
 |------|----------|---------|----------|---------|-------|
 {% for lang in language_results %}
-| {{ loop.index }} | [{{ lang.name }}](../languages/{{ lang.slug }}) | {{ lang.indicator }} {{ lang.avg_overall|round(1) }} | {{ lang.avg_accuracy|round(1) }} | {{ lang.avg_fluency|round(1) }} | {{ lang.avg_style|round(1) }} |
+| {{ loop.index }} | [{{ lang.name }}]({{ lang.page_name }}) | {{ lang.indicator }} {{ lang.avg_overall|round(1) }} | {{ lang.avg_accuracy|round(1) }} | {{ lang.avg_fluency|round(1) }} | {{ lang.avg_style|round(1) }} |
 {% endfor %}

 </details>

@@ -52,7 +52,7 @@
 | Language | Overall | Accuracy | Fluency | Style |
 |----------|---------|----------|---------|-------|
 {% for lang in category.languages %}
-| [{{ lang.name }}](../languages/{{ lang.slug }}) | {{ lang.indicator }} {{ lang.avg_overall|round(1) }} | {{ lang.avg_accuracy|round(1) }} | {{ lang.avg_fluency|round(1) }} | {{ lang.avg_style|round(1) }} |
+| [{{ lang.name }}]({{ lang.page_name }}) | {{ lang.indicator }} {{ lang.avg_overall|round(1) }} | {{ lang.avg_accuracy|round(1) }} | {{ lang.avg_fluency|round(1) }} | {{ lang.avg_style|round(1) }} |
 {% endfor %}

 **Category Average:** {{ category.indicator }} {{ category.avg_overall|round(1) }}

@@ -130,4 +130,4 @@

 ---

-[< Back to Home](../Home) | [All Models](../All-Models)
+[ Back to Home](Home) | [All Models](All-Models)
