From 2892a825c35625ea3d7d970949c88433c1a85a31 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Sun, 27 Apr 2025 05:55:22 +0300 Subject: [PATCH 1/4] add 2 web SSL dino models --- mteb/models/dino_models.py | 63 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/mteb/models/dino_models.py b/mteb/models/dino_models.py index 847d7ee261..a01a4bb15d 100644 --- a/mteb/models/dino_models.py +++ b/mteb/models/dino_models.py @@ -7,7 +7,7 @@ from PIL import Image from torch.utils.data import DataLoader from tqdm import tqdm -from transformers import AutoImageProcessor, AutoModel +from transformers import AutoImageProcessor, AutoModel, Dinov2Model from mteb.encoder_interface import PromptType from mteb.model_meta import ModelMeta @@ -26,7 +26,10 @@ def __init__( ): self.model_name = model_name self.device = device - self.model = AutoModel.from_pretrained(model_name).to(self.device) + if "webssl" in model_name: + self.model = Dinov2Model.from_pretrained(model_name).to(self.device) + else: + self.model = AutoModel.from_pretrained(model_name).to(self.device) self.processor = AutoImageProcessor.from_pretrained(model_name) @staticmethod @@ -225,3 +228,59 @@ def get_fused_embeddings( use_instructions=False, training_datasets=dinov2_training_datasets, ) + +webssl_dino_training_datasets = { + # MetaCLIP 2B samples +} + +webssl_dino300m_full2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino300m-full2b-224", + ), + name="facebook/webssl-dino300m-full2b-224", + languages=["eng-Latn"], + revision="8529cdb3fb75014932af3b896455fc21c386168e", + release_date="2025-04-24", + modalities=["image"], + n_parameters=304_000_000, + memory_usage_mb=1158, + max_tokens=None, + embed_dim=1024, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino300m-full2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino1b_full2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino1b-full2b-224", + ), + name="facebook/webssl-dino1b-full2b-224", + languages=["eng-Latn"], + revision="d3bf033d9c8cc62ea9e73c40956642cad2ec568a", + release_date="2025-04-24", + modalities=["image"], + n_parameters=1_130_000_000, + memory_usage_mb=4329, + max_tokens=None, + embed_dim=1536, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino1b-full2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + + From dd47d01cf0570b06f9291607b7ae9f9347ee5f71 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Wed, 30 Apr 2025 06:52:46 +0300 Subject: [PATCH 2/4] add models from collection and revisions --- mteb/models/dino_models.py | 323 +++++++++++++++++++++++++++++++++++++ 1 file changed, 323 insertions(+) diff --git a/mteb/models/dino_models.py b/mteb/models/dino_models.py index a01a4bb15d..fa9a8e91f6 100644 --- a/mteb/models/dino_models.py +++ b/mteb/models/dino_models.py @@ -283,4 +283,327 @@ def get_fused_embeddings( training_datasets=webssl_dino_training_datasets, ) +webssl_dino2b_full2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino2b-full2b-224", + ), + name="facebook/webssl-dino2b-full2b-224", + languages=["eng-Latn"], + revision="cd5893e3fd2e988eb716792049b3dd53b3f1b68b", + release_date="2025-04-24", + modalities=["image"], + n_parameters=2_080_000_000, + memory_usage_mb=7537, + max_tokens=None, + embed_dim=2688, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino2b-full2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino3b_full2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino3b-full2b-224", + ), + name="facebook/webssl-dino3b-full2b-224", + languages=["eng-Latn"], + revision="2d015c340b16bc47bc6557fcb4e6c83a9d4aa1d3", + release_date="2025-04-24", + modalities=["image"], + n_parameters=3_000_000_000, + memory_usage_mb=11200, + max_tokens=None, + embed_dim=3072, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino3b-full2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino5b_full2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino5b-full2b-224", + ), + name="facebook/webssl-dino5b-full2b-224", + languages=["eng-Latn"], + revision="88006b18b9af369f6c611db7a64d908bde3714e0", + release_date="2025-04-24", + modalities=["image"], + n_parameters=5_000_000_000, + memory_usage_mb=18750, + max_tokens=None, + embed_dim=2560, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino5b-full2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino7b_full8b_224 = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino7b-full8b-224", + ), + name="facebook/webssl-dino7b-full8b-224", + languages=["eng-Latn"], + revision="c6085463ea680043042a80c6d41db2c65e85f466", + release_date="2025-04-24", + modalities=["image"], + n_parameters=7_000_000_000, + memory_usage_mb=26250, + max_tokens=None, + embed_dim=3072, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino7b-full8b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino7b_full8b_378 = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino7b-full8b-378", + ), + name="facebook/webssl-dino7b-full8b-378", + languages=["eng-Latn"], + revision="53c8c5b43070bd2ddb3f66161140408ce832301f", + release_date="2025-04-24", + modalities=["image"], + n_parameters=7_000_000_000, + memory_usage_mb=26250, + max_tokens=None, + embed_dim=3072, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino7b-full8b-378", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino7b_full8b_518 = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino7b-full8b-518", + ), + name="facebook/webssl-dino7b-full8b-518", + languages=["eng-Latn"], + revision="aee350d2c5e3e5fdb7ee6985291d808ea5eef431", + release_date="2025-04-24", + modalities=["image"], + n_parameters=7_000_000_000, + memory_usage_mb=26250, + max_tokens=None, + embed_dim=3072, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino7b-full8b-518", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino2b_light2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino2b-light2b-224", + ), + name="facebook/webssl-dino2b-light2b-224", + languages=["eng-Latn"], + revision="633a663f304e63cc3cbec3f7f9ca2fbc94736128", + release_date="2025-04-24", + modalities=["image"], + n_parameters=2_000_000_000, + memory_usage_mb=7537, + max_tokens=None, + embed_dim=1792, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino2b-light2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino2b_heavy2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino2b-heavy2b-224", + ), + name="facebook/webssl-dino2b-heavy2b-224", + languages=["eng-Latn"], + revision="9f46eb0c0129656a1ef195fde072e3765abdb7c6", + release_date="2025-04-24", + modalities=["image"], + n_parameters=2_000_000_000, + memory_usage_mb=7537, + max_tokens=None, + embed_dim=1792, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino2b-heavy2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino3b_light2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino3b-light2b-224", + ), + name="facebook/webssl-dino3b-light2b-224", + languages=["eng-Latn"], + revision="4d0160f60673805431f4ad14983e712ed88be5b8", + release_date="2025-04-24", + modalities=["image"], + n_parameters=3_000_000_000, + memory_usage_mb=11200, + max_tokens=None, + embed_dim=2048, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino3b-light2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) +webssl_dino3b_heavy2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino3b-heavy2b-224", + ), + name="facebook/webssl-dino3b-heavy2b-224", + languages=["eng-Latn"], + revision="dd39c2910747561b332285d96c4dce0bdb240775", + release_date="2025-04-24", + modalities=["image"], + n_parameters=3_000_000_000, + memory_usage_mb=11200, + max_tokens=None, + embed_dim=2048, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino3b-heavy2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_mae300m_full2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-mae300m-full2b-224", + ), + name="facebook/webssl-mae300m-full2b-224", + languages=["eng-Latn"], + revision="4655a0ac1726c206ba14d5ccb26758c62a4d03b0", + release_date="2025-04-24", + modalities=["image"], + n_parameters=304_000_000, + memory_usage_mb=1158, + max_tokens=None, + embed_dim=1024, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-mae300m-full2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_mae700m_full2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-mae700m-full2b-224", + ), + name="facebook/webssl-mae700m-full2b-224", + languages=["eng-Latn"], + revision="c32be382e757d73a178de1ead62c27391d4b4280", + release_date="2025-04-24", + modalities=["image"], + n_parameters=700_000_000, + memory_usage_mb=2660, + max_tokens=None, + embed_dim=1280, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-mae700m-full2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_mae1b_full2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-mae1b-full2b-224", + ), + name="facebook/webssl-mae1b-full2b-224", + languages=["eng-Latn"], + revision="5880aefedbad8db0f44d27358f6f08e8576f70fc", + release_date="2025-04-24", + modalities=["image"], + n_parameters=1_000_000_000, + memory_usage_mb=3800, + max_tokens=None, + embed_dim=1408, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-mae1b-full2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) From fa437d1b42ccfea5217f85cd2765c41ad314ec66 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Wed, 30 Apr 2025 16:37:15 +0300 Subject: [PATCH 3/4] update memory_usage_mb and embed dim --- mteb/models/dino_models.py | 44 +++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/mteb/models/dino_models.py b/mteb/models/dino_models.py index fa9a8e91f6..177fe0d891 100644 --- a/mteb/models/dino_models.py +++ b/mteb/models/dino_models.py @@ -294,7 +294,7 @@ def get_fused_embeddings( release_date="2025-04-24", modalities=["image"], n_parameters=2_080_000_000, - memory_usage_mb=7537, + memory_usage_mb=7951, max_tokens=None, embed_dim=2688, license="cc-by-nc-4.0", @@ -319,7 +319,7 @@ def get_fused_embeddings( release_date="2025-04-24", modalities=["image"], n_parameters=3_000_000_000, - memory_usage_mb=11200, + memory_usage_mb=11247, max_tokens=None, embed_dim=3072, license="cc-by-nc-4.0", @@ -344,9 +344,9 @@ def get_fused_embeddings( release_date="2025-04-24", modalities=["image"], n_parameters=5_000_000_000, - memory_usage_mb=18750, + memory_usage_mb=18838, max_tokens=None, - embed_dim=2560, + embed_dim=3584, license="cc-by-nc-4.0", open_weights=True, public_training_code="", @@ -369,9 +369,9 @@ def get_fused_embeddings( release_date="2025-04-24", modalities=["image"], n_parameters=7_000_000_000, - memory_usage_mb=26250, + memory_usage_mb=24605, max_tokens=None, - embed_dim=3072, + embed_dim=4096, license="cc-by-nc-4.0", open_weights=True, public_training_code="", @@ -394,9 +394,9 @@ def get_fused_embeddings( release_date="2025-04-24", modalities=["image"], n_parameters=7_000_000_000, - memory_usage_mb=26250, + memory_usage_mb=24613, max_tokens=None, - embed_dim=3072, + embed_dim=4096, license="cc-by-nc-4.0", open_weights=True, public_training_code="", @@ -419,9 +419,9 @@ def get_fused_embeddings( release_date="2025-04-24", modalities=["image"], n_parameters=7_000_000_000, - memory_usage_mb=26250, + memory_usage_mb=24623, max_tokens=None, - embed_dim=3072, + embed_dim=4096, license="cc-by-nc-4.0", open_weights=True, public_training_code="", @@ -444,9 +444,9 @@ def get_fused_embeddings( release_date="2025-04-24", modalities=["image"], n_parameters=2_000_000_000, - memory_usage_mb=7537, + memory_usage_mb=7951, max_tokens=None, - embed_dim=1792, + embed_dim=2688, license="cc-by-nc-4.0", open_weights=True, public_training_code="", @@ -469,9 +469,9 @@ def get_fused_embeddings( release_date="2025-04-24", modalities=["image"], n_parameters=2_000_000_000, - memory_usage_mb=7537, + memory_usage_mb=7951, max_tokens=None, - embed_dim=1792, + embed_dim=2688, license="cc-by-nc-4.0", open_weights=True, public_training_code="", @@ -494,9 +494,9 @@ def get_fused_embeddings( release_date="2025-04-24", modalities=["image"], n_parameters=3_000_000_000, - memory_usage_mb=11200, + memory_usage_mb=11247, max_tokens=None, - embed_dim=2048, + embed_dim=3072, license="cc-by-nc-4.0", open_weights=True, public_training_code="", @@ -519,9 +519,9 @@ def get_fused_embeddings( release_date="2025-04-24", modalities=["image"], n_parameters=3_000_000_000, - memory_usage_mb=11200, + memory_usage_mb=11247, max_tokens=None, - embed_dim=2048, + embed_dim=3072, license="cc-by-nc-4.0", open_weights=True, public_training_code="", @@ -544,7 +544,7 @@ def get_fused_embeddings( release_date="2025-04-24", modalities=["image"], n_parameters=304_000_000, - memory_usage_mb=1158, + memory_usage_mb=1161, max_tokens=None, embed_dim=1024, license="cc-by-nc-4.0", @@ -569,7 +569,7 @@ def get_fused_embeddings( release_date="2025-04-24", modalities=["image"], n_parameters=700_000_000, - memory_usage_mb=2660, + memory_usage_mb=2412, max_tokens=None, embed_dim=1280, license="cc-by-nc-4.0", @@ -594,9 +594,9 @@ def get_fused_embeddings( release_date="2025-04-24", modalities=["image"], n_parameters=1_000_000_000, - memory_usage_mb=3800, + memory_usage_mb=4337, max_tokens=None, - embed_dim=1408, + embed_dim=1536, license="cc-by-nc-4.0", open_weights=True, public_training_code="", From 44ef88bb6129898f1becfc7a31c1dad8565f1823 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Wed, 30 Apr 2025 17:27:24 +0300 Subject: [PATCH 4/4] use automodel instead --- mteb/models/dino_models.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mteb/models/dino_models.py b/mteb/models/dino_models.py index 177fe0d891..b25d8770ee 100644 --- a/mteb/models/dino_models.py +++ b/mteb/models/dino_models.py @@ -7,7 +7,7 @@ from PIL import Image from torch.utils.data import DataLoader from tqdm import tqdm -from transformers import AutoImageProcessor, AutoModel, Dinov2Model +from transformers import AutoImageProcessor, AutoModel from mteb.encoder_interface import PromptType from mteb.model_meta import ModelMeta @@ -26,10 +26,7 @@ def __init__( ): self.model_name = model_name self.device = device - if "webssl" in model_name: - self.model = Dinov2Model.from_pretrained(model_name).to(self.device) - else: - self.model = AutoModel.from_pretrained(model_name).to(self.device) + self.model = AutoModel.from_pretrained(model_name).to(self.device) self.processor = AutoImageProcessor.from_pretrained(model_name) @staticmethod