Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions mteb/abstasks/TaskMetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
"Tumor detection",
"Duplicate Detection",
"Rendered semantic textual similarity",
"Intent classification",
]

TASK_DOMAIN = Literal[
Expand Down
61 changes: 52 additions & 9 deletions mteb/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@


MMTEB_CITATION = """@article{enevoldsen2025mmtebmassivemultilingualtext,
title={MMTEB: Massive Multilingual Text Embedding Benchmark},
title={MMTEB: Massive Multilingual Text Embedding Benchmark},
author={Kenneth Enevoldsen and Isaac Chung and Imene Kerboua and Márton Kardos and Ashwin Mathur and David Stap and Jay Gala and Wissam Siblini and Dominik Krzemiński and Genta Indra Winata and Saba Sturua and Saiteja Utpala and Mathieu Ciancone and Marion Schaeffer and Gabriel Sequeira and Diganta Misra and Shreeya Dhakal and Jonathan Rystrøm and Roman Solomatin and Ömer Çağatan and Akash Kundu and Martin Bernstorff and Shitao Xiao and Akshita Sukhlecha and Bhavish Pahwa and Rafał Poświata and Kranthi Kiran GV and Shawon Ashraf and Daniel Auras and Björn Plüster and Jan Philipp Harries and Loïc Magne and Isabelle Mohr and Mariya Hendriksen and Dawei Zhu and Hippolyte Gisserot-Boukhlef and Tom Aarsen and Jan Kostkan and Konrad Wojtasik and Taemin Lee and Marek Šuppa and Crystina Zhang and Roberta Rocca and Mohammed Hamdy and Andrianos Michail and John Yang and Manuel Faysse and Aleksei Vatolin and Nandan Thakur and Manan Dey and Dipam Vasani and Pranjal Chitale and Simone Tedeschi and Nguyen Tai and Artem Snegirev and Michael Günther and Mengzhou Xia and Weijia Shi and Xing Han Lù and Jordan Clive and Gayatri Krishnakumar and Anna Maksimova and Silvan Wehrli and Maria Tikhonova and Henil Panchal and Aleksandr Abramov and Malte Ostendorff and Zheng Liu and Simon Clematide and Lester James Miranda and Alena Fenogenova and Guangyu Song and Ruqiya Bin Safi and Wen-Ding Li and Alessia Borghini and Federico Cassano and Hongjin Su and Jimmy Lin and Howard Yen and Lasse Hansen and Sara Hooker and Chenghao Xiao and Vaibhav Adlakha and Orion Weller and Siva Reddy and Niklas Muennighoff},
publisher = {arXiv},
journal={arXiv preprint arXiv:2502.13595},
year={2025},
url={https://arxiv.org/abs/2502.13595},
url={https://arxiv.org/abs/2502.13595},
doi = {10.48550/arXiv.2502.13595},
}"""

Expand Down Expand Up @@ -220,9 +220,12 @@
"RuBQRetrieval",
# STS
"RUParaPhraserSTS",
"RuSTSBenchmarkSTS",
"STS22",
],
)
+ get_tasks(
tasks=["RuSTSBenchmarkSTS"],
eval_splits=["test"],
),
description="A Russian version of the Massive Text Embedding Benchmark with a number of novel Russian tasks in all task categories of the original MTEB.",
reference="https://aclanthology.org/2023.eacl-main.148/",
Expand Down Expand Up @@ -1559,13 +1562,13 @@
reference="",
contacts=["gowitheflow-1998", "isaac-chung"],
citation="""@misc{xiao2025miebmassiveimageembedding,
title={MIEB: Massive Image Embedding Benchmark},
title={MIEB: Massive Image Embedding Benchmark},
author={Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
year={2025},
eprint={2504.10471},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2504.10471},
url={https://arxiv.org/abs/2504.10471},
}""",
)

Expand All @@ -1590,13 +1593,13 @@
reference="",
contacts=["gowitheflow-1998", "isaac-chung"],
citation="""@misc{xiao2025miebmassiveimageembedding,
title={MIEB: Massive Image Embedding Benchmark},
title={MIEB: Massive Image Embedding Benchmark},
author={Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
year={2025},
eprint={2504.10471},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2504.10471},
url={https://arxiv.org/abs/2504.10471},
}""",
)

Expand Down Expand Up @@ -1670,13 +1673,13 @@
reference="",
contacts=["gowitheflow-1998", "isaac-chung"],
citation="""@misc{xiao2025miebmassiveimageembedding,
title={MIEB: Massive Image Embedding Benchmark},
title={MIEB: Massive Image Embedding Benchmark},
author={Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
year={2025},
eprint={2504.10471},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2504.10471},
url={https://arxiv.org/abs/2504.10471},
}""",
)

Expand All @@ -1700,3 +1703,43 @@
}""",
contacts=["mehrzadshm"],
)

# Benchmark definition for Encodechka — a community rating of Russian
# sentence encoders (https://github.com/avidale/encodechka). The short
# comments below map each task to the original Encodechka task codes.
ENCODECHKA = Benchmark(
    name="Encodechka",
    tasks=MTEBTasks(
        get_tasks(
            tasks=[
                # PI: paraphrase identification
                "RUParaPhraserSTS",
                # SA: sentiment analysis
                "SentiRuEval2016",
                # TI: toxicity identification
                "RuToxicOKMLCUPClassification",
                # IA: inappropriateness identification
                "InappropriatenessClassificationv2",
                # IC, ICX: intent classification (task has Cyrillic-only and
                # mixed-script subsets; see RuNLUIntentClassification)
                "RuNLUIntentClassification",
            ]
        )
        +
        # NLI: natural language inference — Russian portion of XNLI, test split only
        get_tasks(tasks=["XNLI"], eval_splits=["test"], languages=["rus-Cyrl"])
        # STS: semantic textual similarity — validation split, Russian only
        + get_tasks(
            tasks=["RuSTSBenchmarkSTS"],
            eval_splits=["validation"],
            languages=["rus-Cyrl"],
        ),
    ),
    description="A benchmark for evaluating text embedding models on Russian data.",
    reference="https://github.com/avidale/encodechka",
    citation="""@misc{dale_encodechka,
author = "Dale, David",
title = "Russian rating of sentence encoders",
editor = "habr.com",
url = "https://habr.com/ru/articles/669674/",
month = {June},
year = {2022},
note = {[Online; posted 12-June-2022]},
}""",
)
8 changes: 5 additions & 3 deletions mteb/leaderboard/benchmark_selector.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from __future__ import annotations

import gradio as gr

"""
Each entry is a tuple, where the first element is a label, and the second is either a single benchmark or a group of benchmarks.

Example:
Example:
[
("First Benchmark", dict(value="MTEB(something)", icon="icon_url")),
("Group of Benchmarks",
("Group of Benchmarks",
[
("Second Benchmark", dict(value="MTEB(something)", icon="icon_url")),
("Third Benchmark", dict(value="MTEB(something)", icon="icon_url")),
Expand Down Expand Up @@ -252,7 +254,7 @@ def make_selector(entries: list[tuple[str, dict | list]]) -> tuple[gr.State, gr.
else:
gr.Markdown(f"### **{label}**")
for sub_label, sub_entry in entry:
button = _create_button(
button = _create_button( # noqa: F841
i, sub_label, sub_entry, state, label_to_value, size="md"
)

Expand Down
3 changes: 3 additions & 0 deletions mteb/tasks/Classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,12 @@
from .rus.HeadlineClassification import *
from .rus.InappropriatenessClassification import *
from .rus.KinopoiskClassification import *
from .rus.ru_nlu_intent_classification import *
from .rus.ru_toixic_classification_okmlcup import *
from .rus.RuReviewsClassification import *
from .rus.RuSciBenchGRNTIClassification import *
from .rus.RuSciBenchOECDClassification import *
from .rus.senti_ru_eval import *
from .san.SanskritShlokasClassification import *
from .sin.SinhalaNewsClassification import *
from .sin.SinhalaNewsSourceClassification import *
Expand Down
54 changes: 54 additions & 0 deletions mteb/tasks/Classification/rus/InappropriatenessClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,57 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
)


class InappropriatenessClassificationv2(AbsTaskClassification):
    """Binary classification of inappropriate (reputation-harming) Russian messages.

    Second revision of the inappropriateness task, backed by the
    ``mteb/InappropriatenessClassificationv2`` dataset. Inappropriateness here is
    distinct from plain toxicity: a message may be non-toxic yet still harm a
    speaker's reputation (see the cited BSNLP 2021 paper).
    """

    metadata = TaskMetadata(
        name="InappropriatenessClassificationv2",
        dataset={
            "path": "mteb/InappropriatenessClassificationv2",
            "revision": "698cb161a90150ec46618f714cdd8606cf21a9eb",
        },
        description="Inappropriateness identification in the form of binary classification",
        reference="https://aclanthology.org/2021.bsnlp-1.4",
        type="Classification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["rus-Cyrl"],
        main_score="accuracy",
        date=("2006-01-01", "2021-04-01"),
        domains=["Web", "Social", "Written"],
        task_subtypes=["Sentiment/Hate speech"],
        license="cc-by-nc-sa-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@inproceedings{babakov-etal-2021-detecting,
title = "Detecting Inappropriate Messages on Sensitive Topics that Could Harm a Company{'}s Reputation",
author = "Babakov, Nikolay and
Logacheva, Varvara and
Kozlova, Olga and
Semenov, Nikita and
Panchenko, Alexander",
editor = "Babych, Bogdan and
Kanishcheva, Olga and
Nakov, Preslav and
Piskorski, Jakub and
Pivovarova, Lidia and
Starko, Vasyl and
Steinberger, Josef and
Yangarber, Roman and
Marci{\'n}czuk, Micha{\l} and
Pollak, Senja and
P{\v{r}}ib{\'a}{\v{n}}, Pavel and
Robnik-{\v{S}}ikonja, Marko",
booktitle = "Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing",
month = apr,
year = "2021",
address = "Kiyv, Ukraine",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.bsnlp-1.4",
pages = "26--36",
abstract = "Not all topics are equally {``}flammable{''} in terms of toxicity: a calm discussion of turtles or fishing less often fuels inappropriate toxic dialogues than a discussion of politics or sexual minorities. We define a set of sensitive topics that can yield inappropriate and toxic messages and describe the methodology of collecting and labelling a dataset for appropriateness. While toxicity in user-generated data is well-studied, we aim at defining a more fine-grained notion of inappropriateness. The core of inappropriateness is that it can harm the reputation of a speaker. This is different from toxicity in two respects: (i) inappropriateness is topic-related, and (ii) inappropriate message is not toxic but still unacceptable. We collect and release two datasets for Russian: a topic-labelled dataset and an appropriateness-labelled dataset. We also release pre-trained classification models trained on this data.",
}""",
        prompt="Classify the given message as either sensitive topic or not",
    )
50 changes: 50 additions & 0 deletions mteb/tasks/Classification/rus/ru_nlu_intent_classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.MultilingualTask import MultilingualTask
from mteb.abstasks.TaskMetadata import TaskMetadata


class RuNLUIntentClassification(AbsTaskClassification, MultilingualTask):
    """Intent classification for Russian home-domain human-robot interaction utterances.

    Derived from the NLU-benchmark data of Liu et al. (2019). Exposed as a
    multilingual task with two evaluation subsets (see ``eval_langs``).
    """

    metadata = TaskMetadata(
        name="RuNLUIntentClassification",
        dataset={
            "path": "mteb/RuNLUIntentClassification",
            "revision": "424d0f767aaa5c411e3a529eec04658e5726a39e",
        },
        description=(
            "Contains natural language data for human-robot interaction in home domain which we collected and"
            " annotated for evaluating NLU Services/platforms."
        ),
        reference="https://arxiv.org/abs/1903.05566",
        type="Classification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        # Two subsets: "rus-eng" is tagged with both Cyrillic and Latin scripts
        # (presumably mixed/transliterated text — TODO confirm against the
        # dataset card), while "rus" is Cyrillic-only.
        eval_langs={
            "rus-eng": [
                "rus-Cyrl",
                "rus-Latn",
            ],
            "rus": [
                "rus-Cyrl",
            ],
        },
        main_score="accuracy",
        date=("2019-03-26", "2019-03-26"),
        domains=[],
        task_subtypes=["Intent classification"],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@misc{liu2019benchmarkingnaturallanguageunderstanding,
title={Benchmarking Natural Language Understanding Services for building Conversational Agents},
author={Xingkun Liu and Arash Eshghi and Pawel Swietojanski and Verena Rieser},
year={2019},
eprint={1903.05566},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1903.05566},
}""",
    )
33 changes: 33 additions & 0 deletions mteb/tasks/Classification/rus/ru_toixic_classification_okmlcup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class RuToxicOKMLCUPClassification(AbsTaskClassification):
    """Binary toxicity classification of Russian comments from the Odnoklassniki
    social network (OK ML Cup 2020 competition data)."""

    metadata = TaskMetadata(
        name="RuToxicOKMLCUPClassification",
        dataset={
            "path": "mteb/RuToxicOKMLCUPClassification",
            "revision": "13722b7320ef4b6a471f9e8b379f3f49167d0517",
        },
        description="On the Odnoklassniki social network, users post a huge number of comments of various directions and nature every day.",
        reference="https://cups.online/ru/contests/okmlcup2020",
        type="Classification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["rus-Cyrl"],
        main_score="accuracy",
        date=("2015-01-01", "2020-01-01"),
        domains=[],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""""",  # no citation available for the competition dataset
    )

    def dataset_transform(self):
        # Harmonize the target column name expected by the evaluator: the raw
        # dataset stores the binary target in a column named "toxic".
        self.dataset = self.dataset.rename_column("toxic", "label")
39 changes: 39 additions & 0 deletions mteb/tasks/Classification/rus/senti_ru_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class SentiRuEval2016Classification(AbsTaskClassification):
    """Russian Twitter sentiment classification from the SentiRuEval-2016
    evaluation (reputation monitoring of banks and telecom companies)."""

    metadata = TaskMetadata(
        name="SentiRuEval2016",
        dataset={
            "path": "mteb/SentiRuEval2016",
            "revision": "8507eab0deef37f040a750afbcb4dba7a7de9c16",
        },
        description="Russian sentiment analysis evaluation SentiRuEval-2016 devoted to reputation monitoring of banks "
        "and telecom companies in Twitter. We describe the task, data, the procedure of data preparation, "
        "and participants’ results.",
        reference="https://github.com/mokoron/sentirueval",
        type="Classification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["rus-Cyrl"],
        main_score="accuracy",
        date=("2015-01-01", "2016-01-01"),
        domains=[],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@inproceedings{loukachevitch2016sentirueval,
title={SentiRuEval-2016: overcoming time gap and data sparsity in tweet sentiment analysis},
author={Loukachevitch, NV and Rubtsova, Yu V},
booktitle={Computational Linguistics and Intellectual Technologies},
pages={416--426},
year={2016}
}
""",
    )
1 change: 1 addition & 0 deletions mteb/tasks/MultiLabelClassification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
from .multilingual.MultiEURLEXMultilabelClassification import *
from .por.BrazilianToxicTweetsClassification import *
from .rus.CEDRClassification import *
from .rus.ru_toixic_multilabelclassification_okmlcup import *
from .rus.SensitiveTopicsClassification import *
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskMultilabelClassification import (
AbsTaskMultilabelClassification,
)
from mteb.abstasks.TaskMetadata import TaskMetadata


class RuToxicOKMLCUPMultilabelClassification(AbsTaskMultilabelClassification):
    """Multi-label toxicity classification of Russian comments from the
    Odnoklassniki social network (OK ML Cup 2020 competition data)."""

    metadata = TaskMetadata(
        name="RuToxicOKMLCUPMultilabelClassification",
        dataset={
            # Same dataset repo and revision as the single-label
            # RuToxicOKMLCUPClassification task.
            "path": "mteb/RuToxicOKMLCUPClassification",
            "revision": "13722b7320ef4b6a471f9e8b379f3f49167d0517",
        },
        description="On the Odnoklassniki social network, users post a huge number of comments of various directions and nature every day.",
        reference="https://cups.online/ru/contests/okmlcup2020",
        # Fix: this task subclasses AbsTaskMultilabelClassification, so its
        # metadata type must be "MultilabelClassification" (as in sibling
        # multilabel tasks such as CEDRClassification), not "Classification".
        type="MultilabelClassification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["rus-Cyrl"],
        main_score="accuracy",
        # NOTE(review): the single-label twin declares (2015, 2020); the 2024
        # end date here looks inconsistent — confirm against the dataset card.
        date=("2015-01-01", "2024-01-01"),
        domains=[],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""""",  # no citation available for the competition dataset
    )

    def dataset_transform(self):
        # Harmonize the target column name expected by the evaluator.
        # NOTE(review): the single-label twin renames "toxic" while this task
        # renames "labels", despite sharing a dataset revision — verify the
        # multilabel column actually exists in this revision.
        self.dataset = self.dataset.rename_column("labels", "label")
Loading