Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions mteb/abstasks/TaskMetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
"Tumor detection",
"Duplicate Detection",
"Rendered semantic textual similarity",
"Intent classification",
]

TASK_DOMAIN = Literal[
Expand Down
61 changes: 52 additions & 9 deletions mteb/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@


MMTEB_CITATION = """@article{enevoldsen2025mmtebmassivemultilingualtext,
title={MMTEB: Massive Multilingual Text Embedding Benchmark},
title={MMTEB: Massive Multilingual Text Embedding Benchmark},
author={Kenneth Enevoldsen and Isaac Chung and Imene Kerboua and Márton Kardos and Ashwin Mathur and David Stap and Jay Gala and Wissam Siblini and Dominik Krzemiński and Genta Indra Winata and Saba Sturua and Saiteja Utpala and Mathieu Ciancone and Marion Schaeffer and Gabriel Sequeira and Diganta Misra and Shreeya Dhakal and Jonathan Rystrøm and Roman Solomatin and Ömer Çağatan and Akash Kundu and Martin Bernstorff and Shitao Xiao and Akshita Sukhlecha and Bhavish Pahwa and Rafał Poświata and Kranthi Kiran GV and Shawon Ashraf and Daniel Auras and Björn Plüster and Jan Philipp Harries and Loïc Magne and Isabelle Mohr and Mariya Hendriksen and Dawei Zhu and Hippolyte Gisserot-Boukhlef and Tom Aarsen and Jan Kostkan and Konrad Wojtasik and Taemin Lee and Marek Šuppa and Crystina Zhang and Roberta Rocca and Mohammed Hamdy and Andrianos Michail and John Yang and Manuel Faysse and Aleksei Vatolin and Nandan Thakur and Manan Dey and Dipam Vasani and Pranjal Chitale and Simone Tedeschi and Nguyen Tai and Artem Snegirev and Michael Günther and Mengzhou Xia and Weijia Shi and Xing Han Lù and Jordan Clive and Gayatri Krishnakumar and Anna Maksimova and Silvan Wehrli and Maria Tikhonova and Henil Panchal and Aleksandr Abramov and Malte Ostendorff and Zheng Liu and Simon Clematide and Lester James Miranda and Alena Fenogenova and Guangyu Song and Ruqiya Bin Safi and Wen-Ding Li and Alessia Borghini and Federico Cassano and Hongjin Su and Jimmy Lin and Howard Yen and Lasse Hansen and Sara Hooker and Chenghao Xiao and Vaibhav Adlakha and Orion Weller and Siva Reddy and Niklas Muennighoff},
publisher = {arXiv},
journal={arXiv preprint arXiv:2502.13595},
year={2025},
url={https://arxiv.org/abs/2502.13595},
url={https://arxiv.org/abs/2502.13595},
doi = {10.48550/arXiv.2502.13595},
}"""

Expand Down Expand Up @@ -220,9 +220,12 @@
"RuBQRetrieval",
# STS
"RUParaPhraserSTS",
"RuSTSBenchmarkSTS",
"STS22",
],
)
+ get_tasks(
tasks=["RuSTSBenchmarkSTS"],
eval_splits=["test"],
),
description="A Russian version of the Massive Text Embedding Benchmark with a number of novel Russian tasks in all task categories of the original MTEB.",
reference="https://aclanthology.org/2023.eacl-main.148/",
Expand Down Expand Up @@ -1559,13 +1562,13 @@
reference="",
contacts=["gowitheflow-1998", "isaac-chung"],
citation="""@misc{xiao2025miebmassiveimageembedding,
title={MIEB: Massive Image Embedding Benchmark},
title={MIEB: Massive Image Embedding Benchmark},
author={Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
year={2025},
eprint={2504.10471},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2504.10471},
url={https://arxiv.org/abs/2504.10471},
}""",
)

Expand All @@ -1590,13 +1593,13 @@
reference="",
contacts=["gowitheflow-1998", "isaac-chung"],
citation="""@misc{xiao2025miebmassiveimageembedding,
title={MIEB: Massive Image Embedding Benchmark},
title={MIEB: Massive Image Embedding Benchmark},
author={Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
year={2025},
eprint={2504.10471},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2504.10471},
url={https://arxiv.org/abs/2504.10471},
}""",
)

Expand Down Expand Up @@ -1670,13 +1673,13 @@
reference="",
contacts=["gowitheflow-1998", "isaac-chung"],
citation="""@misc{xiao2025miebmassiveimageembedding,
title={MIEB: Massive Image Embedding Benchmark},
title={MIEB: Massive Image Embedding Benchmark},
author={Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
year={2025},
eprint={2504.10471},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2504.10471},
url={https://arxiv.org/abs/2504.10471},
}""",
)

Expand All @@ -1700,3 +1703,43 @@
}""",
contacts=["mehrzadshm"],
)

# Benchmark definition for Encodechka — a community rating of Russian
# sentence encoders (https://github.com/avidale/encodechka). The short
# comments below map each task to the original Encodechka task codes.
ENCODECHKA = Benchmark(
    name="Encodechka",
    tasks=MTEBTasks(
        get_tasks(
            tasks=[
                # PI: paraphrase identification
                "RUParaPhraserSTS",
                # SA: sentiment analysis
                "SentiRuEval2016",
                # TI: toxicity identification
                "RuToxicOKMLCUPClassification",
                # IA: inappropriateness identification
                "InappropriatenessClassificationv2",
                # IC, ICX: intent classification (task has Cyrillic-only and
                # mixed-script subsets; see RuNLUIntentClassification)
                "RuNLUIntentClassification",
            ]
        )
        +
        # NLI: natural language inference — Russian portion of XNLI, test split only
        get_tasks(tasks=["XNLI"], eval_splits=["test"], languages=["rus-Cyrl"])
        # STS: semantic textual similarity — validation split, Russian only
        + get_tasks(
            tasks=["RuSTSBenchmarkSTS"],
            eval_splits=["validation"],
            languages=["rus-Cyrl"],
        ),
    ),
    description="A benchmark for evaluating text embedding models on Russian data.",
    reference="https://github.com/avidale/encodechka",
    citation="""@misc{dale_encodechka,
author = "Dale, David",
title = "Russian rating of sentence encoders",
editor = "habr.com",
url = "https://habr.com/ru/articles/669674/",
month = {June},
year = {2022},
note = {[Online; posted 12-June-2022]},
}""",
)
8 changes: 5 additions & 3 deletions mteb/leaderboard/benchmark_selector.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from __future__ import annotations

import gradio as gr

"""
Each entry is a tuple, where the first element is a label, and the second is either a single benchmark or a group of benchmarks.

Example:
Example:
[
("First Benchmark", dict(value="MTEB(something)", icon="icon_url")),
("Group of Benchmarks",
("Group of Benchmarks",
[
("Second Benchmark", dict(value="MTEB(something)", icon="icon_url")),
("Third Benchmark", dict(value="MTEB(something)", icon="icon_url")),
Expand Down Expand Up @@ -252,7 +254,7 @@ def make_selector(entries: list[tuple[str, dict | list]]) -> tuple[gr.State, gr.
else:
gr.Markdown(f"### **{label}**")
for sub_label, sub_entry in entry:
button = _create_button(
button = _create_button( # noqa: F841
i, sub_label, sub_entry, state, label_to_value, size="md"
)

Expand Down
3 changes: 3 additions & 0 deletions mteb/tasks/Classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,12 @@
from .rus.HeadlineClassification import *
from .rus.InappropriatenessClassification import *
from .rus.KinopoiskClassification import *
from .rus.ru_nlu_intent_classification import *
from .rus.ru_toixic_classification_okmlcup import *
from .rus.RuReviewsClassification import *
from .rus.RuSciBenchGRNTIClassification import *
from .rus.RuSciBenchOECDClassification import *
from .rus.senti_ru_eval import *
from .san.SanskritShlokasClassification import *
from .sin.SinhalaNewsClassification import *
from .sin.SinhalaNewsSourceClassification import *
Expand Down
54 changes: 54 additions & 0 deletions mteb/tasks/Classification/rus/InappropriatenessClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,57 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
)


class InappropriatenessClassificationv2(AbsTaskClassification):
    """Binary classification of inappropriate (reputation-harming) Russian messages.

    Second revision of the inappropriateness task, backed by the
    ``mteb/InappropriatenessClassificationv2`` dataset. Inappropriateness here is
    distinct from plain toxicity: a message may be non-toxic yet still harm a
    speaker's reputation (see the cited BSNLP 2021 paper).
    """

    metadata = TaskMetadata(
        name="InappropriatenessClassificationv2",
        dataset={
            "path": "mteb/InappropriatenessClassificationv2",
            "revision": "698cb161a90150ec46618f714cdd8606cf21a9eb",
        },
        description="Inappropriateness identification in the form of binary classification",
        reference="https://aclanthology.org/2021.bsnlp-1.4",
        type="Classification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["rus-Cyrl"],
        main_score="accuracy",
        date=("2006-01-01", "2021-04-01"),
        domains=["Web", "Social", "Written"],
        task_subtypes=["Sentiment/Hate speech"],
        license="cc-by-nc-sa-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@inproceedings{babakov-etal-2021-detecting,
title = "Detecting Inappropriate Messages on Sensitive Topics that Could Harm a Company{'}s Reputation",
author = "Babakov, Nikolay and
Logacheva, Varvara and
Kozlova, Olga and
Semenov, Nikita and
Panchenko, Alexander",
editor = "Babych, Bogdan and
Kanishcheva, Olga and
Nakov, Preslav and
Piskorski, Jakub and
Pivovarova, Lidia and
Starko, Vasyl and
Steinberger, Josef and
Yangarber, Roman and
Marci{\'n}czuk, Micha{\l} and
Pollak, Senja and
P{\v{r}}ib{\'a}{\v{n}}, Pavel and
Robnik-{\v{S}}ikonja, Marko",
booktitle = "Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing",
month = apr,
year = "2021",
address = "Kiyv, Ukraine",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.bsnlp-1.4",
pages = "26--36",
abstract = "Not all topics are equally {``}flammable{''} in terms of toxicity: a calm discussion of turtles or fishing less often fuels inappropriate toxic dialogues than a discussion of politics or sexual minorities. We define a set of sensitive topics that can yield inappropriate and toxic messages and describe the methodology of collecting and labelling a dataset for appropriateness. While toxicity in user-generated data is well-studied, we aim at defining a more fine-grained notion of inappropriateness. The core of inappropriateness is that it can harm the reputation of a speaker. This is different from toxicity in two respects: (i) inappropriateness is topic-related, and (ii) inappropriate message is not toxic but still unacceptable. We collect and release two datasets for Russian: a topic-labelled dataset and an appropriateness-labelled dataset. We also release pre-trained classification models trained on this data.",
}""",
        prompt="Classify the given message as either sensitive topic or not",
    )
50 changes: 50 additions & 0 deletions mteb/tasks/Classification/rus/ru_nlu_intent_classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.MultilingualTask import MultilingualTask
from mteb.abstasks.TaskMetadata import TaskMetadata


class RuNLUIntentClassification(AbsTaskClassification, MultilingualTask):
    """Intent classification for Russian home-domain human-robot interaction utterances.

    Derived from the NLU-benchmark data of Liu et al. (2019). Exposed as a
    multilingual task with two evaluation subsets (see ``eval_langs``).
    """

    metadata = TaskMetadata(
        name="RuNLUIntentClassification",
        dataset={
            "path": "mteb/RuNLUIntentClassification",
            "revision": "424d0f767aaa5c411e3a529eec04658e5726a39e",
        },
        description=(
            "Contains natural language data for human-robot interaction in home domain which we collected and"
            " annotated for evaluating NLU Services/platforms."
        ),
        reference="https://arxiv.org/abs/1903.05566",
        type="Classification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        # Two subsets: "rus-eng" is tagged with both Cyrillic and Latin scripts
        # (presumably mixed/transliterated text — TODO confirm against the
        # dataset card), while "rus" is Cyrillic-only.
        eval_langs={
            "rus-eng": [
                "rus-Cyrl",
                "rus-Latn",
            ],
            "rus": [
                "rus-Cyrl",
            ],
        },
        main_score="accuracy",
        date=("2019-03-26", "2019-03-26"),
        domains=[],
        task_subtypes=["Intent classification"],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@misc{liu2019benchmarkingnaturallanguageunderstanding,
title={Benchmarking Natural Language Understanding Services for building Conversational Agents},
author={Xingkun Liu and Arash Eshghi and Pawel Swietojanski and Verena Rieser},
year={2019},
eprint={1903.05566},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1903.05566},
}""",
    )
33 changes: 33 additions & 0 deletions mteb/tasks/Classification/rus/ru_toixic_classification_okmlcup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class RuToxicOKMLCUPClassification(AbsTaskClassification):
    """Binary toxicity classification of Russian comments from the Odnoklassniki
    social network (OK ML Cup 2020 competition data)."""

    metadata = TaskMetadata(
        name="RuToxicOKMLCUPClassification",
        dataset={
            "path": "mteb/RuToxicOKMLCUPClassification",
            "revision": "13722b7320ef4b6a471f9e8b379f3f49167d0517",
        },
        description="On the Odnoklassniki social network, users post a huge number of comments of various directions and nature every day.",
        reference="https://cups.online/ru/contests/okmlcup2020",
        type="Classification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["rus-Cyrl"],
        main_score="accuracy",
        date=("2015-01-01", "2020-01-01"),
        domains=[],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""""",  # no citation available for the competition dataset
    )

    def dataset_transform(self):
        # Harmonize the target column name expected by the evaluator: the raw
        # dataset stores the binary target in a column named "toxic".
        self.dataset = self.dataset.rename_column("toxic", "label")
39 changes: 39 additions & 0 deletions mteb/tasks/Classification/rus/senti_ru_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class SentiRuEval2016Classification(AbsTaskClassification):
    """Russian Twitter sentiment classification from the SentiRuEval-2016
    evaluation (reputation monitoring of banks and telecom companies)."""

    metadata = TaskMetadata(
        name="SentiRuEval2016",
        dataset={
            "path": "mteb/SentiRuEval2016",
            "revision": "8507eab0deef37f040a750afbcb4dba7a7de9c16",
        },
        description="Russian sentiment analysis evaluation SentiRuEval-2016 devoted to reputation monitoring of banks "
        "and telecom companies in Twitter. We describe the task, data, the procedure of data preparation, "
        "and participants’ results.",
        reference="https://github.com/mokoron/sentirueval",
        type="Classification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["rus-Cyrl"],
        main_score="accuracy",
        date=("2015-01-01", "2016-01-01"),
        domains=[],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@inproceedings{loukachevitch2016sentirueval,
title={SentiRuEval-2016: overcoming time gap and data sparsity in tweet sentiment analysis},
author={Loukachevitch, NV and Rubtsova, Yu V},
booktitle={Computational Linguistics and Intellectual Technologies},
pages={416--426},
year={2016}
}
""",
    )
1 change: 1 addition & 0 deletions mteb/tasks/MultiLabelClassification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
from .multilingual.MultiEURLEXMultilabelClassification import *
from .por.BrazilianToxicTweetsClassification import *
from .rus.CEDRClassification import *
from .rus.ru_toixic_multilabelclassification_okmlcup import *
from .rus.SensitiveTopicsClassification import *
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskMultilabelClassification import (
AbsTaskMultilabelClassification,
)
from mteb.abstasks.TaskMetadata import TaskMetadata


class RuToxicOKMLCUPMultilabelClassification(AbsTaskMultilabelClassification):
    """Multi-label toxicity classification of Russian comments from the
    Odnoklassniki social network (OK ML Cup 2020 competition data)."""

    metadata = TaskMetadata(
        name="RuToxicOKMLCUPMultilabelClassification",
        dataset={
            # Same dataset repo and revision as the single-label
            # RuToxicOKMLCUPClassification task.
            "path": "mteb/RuToxicOKMLCUPClassification",
            "revision": "13722b7320ef4b6a471f9e8b379f3f49167d0517",
        },
        description="On the Odnoklassniki social network, users post a huge number of comments of various directions and nature every day.",
        reference="https://cups.online/ru/contests/okmlcup2020",
        # Fix: this task subclasses AbsTaskMultilabelClassification, so its
        # metadata type must be "MultilabelClassification" (as in sibling
        # multilabel tasks such as CEDRClassification), not "Classification".
        type="MultilabelClassification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["rus-Cyrl"],
        main_score="accuracy",
        # NOTE(review): the single-label twin declares (2015, 2020); the 2024
        # end date here looks inconsistent — confirm against the dataset card.
        date=("2015-01-01", "2024-01-01"),
        domains=[],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""""",  # no citation available for the competition dataset
    )

    def dataset_transform(self):
        # Harmonize the target column name expected by the evaluator.
        # NOTE(review): the single-label twin renames "toxic" while this task
        # renames "labels", despite sharing a dataset revision — verify the
        # multilabel column actually exists in this revision.
        self.dataset = self.dataset.rename_column("labels", "label")
Loading