From 4a3fdc3941a2b097bb5c97889f7dd0bc657e342d Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Thu, 8 May 2025 12:18:01 +0200
Subject: [PATCH 01/60] ci: install omnibenchmark from the dev branch

---
 .github/workflows/benchmark.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index c1a1e82..500eb58 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -18,7 +18,6 @@ jobs:
   run-benchmark:
     name: Run Benchmark
     runs-on: ubuntu-latest
-    ## runs-on: self-hosted
     steps:
       - name: Check out repository
         uses: actions/checkout@v4
@@ -49,7 +48,7 @@ jobs:
         shell: bash -l {0}
         run: |
           mamba install -y pip
-          pip install git+https://github.com/omnibenchmark/omnibenchmark.git@reduce_install_scope
+          pip install git+https://github.com/omnibenchmark/omnibenchmark.git@dev
 
       - name: Load benchmark cache
         id: cache-benchmark
@@ -67,7 +66,6 @@ jobs:
   upload-artifact:
     name: Benchmark Artifact
     runs-on: ubuntu-latest
-    ## runs-on: self-hosted
     needs: run-benchmark
     if: always()
     steps:

From e89adda93e7fdc64b52b2e77dc702969a50f735c Mon Sep 17 00:00:00 2001
From: btraven <128150520+btraven00@users.noreply.github.com>
Date: Mon, 17 Mar 2025 16:20:31 +0100
Subject: [PATCH 02/60] docs: use the public repo URI

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index a75c594..89d7c05 100644
--- a/README.md
+++ b/README.md
@@ -3,8 +3,8 @@ A clustering example for omnibenchmark
 # How to run
 
 1. Install omnibenchmark using [our tutorial](https://omnibenchmark.org/tutorial/)
-2. Clone the benchmark definition / this repository with `git clone git@github.com:omnibenchmark/clustering_example.git`
-3. Move to the cloned repository `cd clustering_example`
+2. Clone the benchmark definition / this repository with `git clone https://github.com/omnibenchmark/clustering_example`
+3. Move into the cloned folder: `cd clustering_example`
 4. Run locally, somewhat in parallel `ob run benchmark -b CLUSTERING.YAML  --local --threads 6`. Choose `Clustering.yml` specification based on whether running it with conda, easybuild, apptainer, etc. [More details about the available backends](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md).
 
 # Clustbench attribution

From 52ebb556eae88f36d2e857aadfe8189c4aca3eaf Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@pm.me>
Date: Tue, 18 Mar 2025 13:02:37 +0100
Subject: [PATCH 03/60] chore: add convenience target to build singularity env

- make script executable
- use /bin/sh instead of /bin/bash
- drop sudo from the singularity build invocations
- add top-level Makefile to prepare env
---
 Makefile                  |  2 ++
 envs/build_singularity.sh | 14 +++++---------
 2 files changed, 7 insertions(+), 9 deletions(-)
 create mode 100644 Makefile
 mode change 100644 => 100755 envs/build_singularity.sh

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..1e56cb2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,2 @@
+prepare_apptainer_env:
+	cd envs && ./build_singularity.sh
diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
old mode 100644
new mode 100755
index 86e053f..c0c3d93
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -1,9 +1,5 @@
-#!/bin/bash
-
-sudo singularity build sklearn.sif sklearn_singularity.def
-
-sudo singularity build clustbench.sif clustbench_singularity.def
-
-sudo singularity build r.sif r_singularity.def
-
-sudo singularity build fcps.sif fcps_singularity.def
+#!/bin/sh
+singularity build sklearn.sif sklearn_singularity.def
+singularity build clustbench.sif clustbench_singularity.def
+singularity build r.sif r_singularity.def
+singularity build fcps.sif fcps_singularity.def

From 83c6f0b0c78851d93be5956fd27a8180c61b2ba7 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@pm.me>
Date: Tue, 18 Mar 2025 13:59:05 +0100
Subject: [PATCH 04/60] feat: parametrize num threads on the makefile

---
 Makefile | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Makefile b/Makefile
index 1e56cb2..3c58e2f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,2 +1,10 @@
+MAX_THREADS ?= 30
+OB_CMD="ob run benchmark -k --local"
 prepare_apptainer_env:
 	cd envs && ./build_singularity.sh
+run_with_apptainer_backend:
+	 ${OB_CMD} -b Clustering_singularity.yml --threads ${MAX_THREADS}
+	 mv out out_apptainer
+run_with_conda_backend:
+	 ${OB_CMD} -b Clustering_conda.yml --threads ${MAX_THREADS}
+	 mv out out_conda

From dc2d629004fcdb40f75bc24194287b961eb40283 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@pm.me>
Date: Tue, 18 Mar 2025 14:02:21 +0100
Subject: [PATCH 05/60] chore: ignore common temporary outputs and image build
 artifacts

---
 .gitignore | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4d38534
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+# image build artifacts
+envs/*.sif
+
+# snakemake
+snakemake.log
+.snakemake/
+
+# vim swaps
+*.swp
+*.swo

From f91603aecf8f82975087c89615d3473d4b79c12f Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@pm.me>
Date: Tue, 18 Mar 2025 13:59:05 +0100
Subject: [PATCH 06/60] fix: remove literal quotes around OB_CMD in the makefile

---
 Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 3c58e2f..6883fa0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,6 @@
 MAX_THREADS ?= 30
-OB_CMD="ob run benchmark -k --local"
+# by default, we want to run all snakemake rules even if there are failures
+OB_CMD=ob run benchmark -k --local
 prepare_apptainer_env:
 	cd envs && ./build_singularity.sh
 run_with_apptainer_backend:

From bea2a75173f9c19edb2adb1c22bc1ab90d62774d Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@pm.me>
Date: Mon, 5 May 2025 10:07:36 +0200
Subject: [PATCH 07/60] fix: use --cores, --task-timeout

---
 Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 6883fa0..73b33b5 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,11 @@
-MAX_THREADS ?= 30
+MAX_CORES ?= 10
 # by default, we want to run all snakemake rules even if there are failures
-OB_CMD=ob run benchmark -k --local
+OB_CMD=ob run benchmark -k --local --task-timeout "4h"
 prepare_apptainer_env:
 	cd envs && ./build_singularity.sh
 run_with_apptainer_backend:
-	 ${OB_CMD} -b Clustering_singularity.yml --threads ${MAX_THREADS}
+	 ${OB_CMD} -b Clustering_singularity.yml --cores ${MAX_CORES}
 	 mv out out_apptainer
 run_with_conda_backend:
-	 ${OB_CMD} -b Clustering_conda.yml --threads ${MAX_THREADS}
+	 ${OB_CMD} -b Clustering_conda.yml --cores ${MAX_CORES}
 	 mv out out_conda

From 67e8cf8bc7e0deab9f6bfdc5aceaffe39841040e Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Wed, 7 May 2025 21:41:46 +0200
Subject: [PATCH 08/60] update .eb files to easybuild 5.0

---
 Makefile           |  9 ++++--
 envs/clustbench.eb | 81 ++++++----------------------------------------
 envs/fcps.eb       | 18 ++++-------
 3 files changed, 23 insertions(+), 85 deletions(-)

diff --git a/Makefile b/Makefile
index 73b33b5..e107f62 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,14 @@
 MAX_CORES ?= 10
 # by default, we want to run all snakemake rules even if there are failures
-OB_CMD=ob run benchmark -k --local --task-timeout "4h"
+OB_CMD=ob run benchmark -k --local --task-timeout "4h" --cores ${MAX_CORES}
 prepare_apptainer_env:
 	cd envs && ./build_singularity.sh
+prepare_envmodules_env:
+	cd envs && eb clustbench.eb --robot
+	cd envs && eb fcps.eb --robot
 run_with_apptainer_backend:
-	 ${OB_CMD} -b Clustering_singularity.yml --cores ${MAX_CORES}
+	 ${OB_CMD} -b Clustering_singularity.yml
 	 mv out out_apptainer
 run_with_conda_backend:
-	 ${OB_CMD} -b Clustering_conda.yml --cores ${MAX_CORES}
+	 ${OB_CMD} -b Clustering_conda.yml
 	 mv out out_conda
diff --git a/envs/clustbench.eb b/envs/clustbench.eb
index 22597fb..f3ee681 100644
--- a/envs/clustbench.eb
+++ b/envs/clustbench.eb
@@ -1,108 +1,47 @@
-## largely as https://github.com/easybuilders/easybuild-easyconfigs/blob/949c266db9e17440ec2829eb8ffdbdb87ceaf543/easybuild/easyconfigs/c/cooler/cooler-0.10.2-foss-2023b.eb#L4
-
 easyblock = 'PythonBundle'
 
 name = 'clustbench'
-version = '1'
+version = '0.1.0'
 
-homepage = 'https://python.org/'
+homepage = 'https://omnibenchmark.org'
 description = "Bundle of Python packages for ob clustering_example"
 
 toolchain = {'name': 'foss', 'version': '2023b'}
 
-
 dependencies = [
     ('Python', '3.11.5'),
-    ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b
     ('SciPy-bundle', '2023.11'),
-    ('meson-python', '0.15.0'),
     ('matplotlib', '3.8.2'),
-    ('scikit-learn', '1.4.0')
-
+    ('scikit-learn', '1.4.0'),
+#    ('meson-python', '0.15.0'),
+#    ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b
 ]
 
-sanity_pip_check = True 
-use_pip = True
-
-exts_default_options = {
-    'sanity_pip_check': True,
-    'use_pip' : True
-}
-
-## https://files.pythonhosted.org/packages/27/fe/e78538f4cd7b1b28e9c625eabd21f314004d00644a8347d0b01473e72ffa/clustering_benchmarks-1.1.5.tar.gz
-## https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
-## https://files.pythonhosted.org/packages/5d/b8/f143d907d93bd4a3dd51d07c4e79b37bedbfc2177f4949bfa0d6ba0af647/fastcluster-1.2.6.tar.gz
-## https://files.pythonhosted.org/packages/68/7c/d465bab9f98b75c5c1f5e80165dd82847a504ced655d162b585df08a717b/genieclust-1.1.6.tar.gz
-## https://files.pythonhosted.org/packages/a2/45/eaaacaa4f4f2931a80d40e453df275d9af7c07616c5d753272d3055fb79e/genieclust-1.1.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
-
-source_urls = [PYPI_SOURCE,
-               'https://files.pythonhosted.org/packages/27/fe/e78538f4cd7b1b28e9c625eabd21f314004d00644a8347d0b01473e72ffa/',
-               'https://files.pythonhosted.org/packages/68/7c/d465bab9f98b75c5c1f5e80165dd82847a504ced655d162b585df08a717b/',
-               'https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/',
-               'https://files.pythonhosted.org/packages/5d/b8/f143d907d93bd4a3dd51d07c4e79b37bedbfc2177f4949bfa0d6ba0af647/',
-               'https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/',
-               'https://files.pythonhosted.org/packages/84/4d/b720d6000f4ca77f030bd70f12550820f0766b568e43f11af7f7ad9061aa/',
-               'https://files.pythonhosted.org/packages/68/dd/fa2e1a45fce2d09f4aea3cee169760e672c8262325aa5796c49d543dc7e6/',
-               'https://files.pythonhosted.org/packages/84/4d/b720d6000f4ca77f030bd70f12550820f0766b568e43f11af7f7ad9061aa',
-               'https://files.pythonhosted.org/packages/67/66/91d242ea8dd1729addd36069318ba2cd03874872764f316c3bb51b633ed2/',
-               'https://files.pythonhosted.org/packages/e2/a9/a0c57aee75f77794adaf35322f8b6404cbd0f89ad45c87197a937764b7d0/',
-               'https://files.pythonhosted.org/packages/d2/c1/72b9622fcb32ff98b054f724e213c7f70d6898baa714f4516288456ceaba/',
-               'https://github.com/pybind/pybind11/archive/',
-               'https://files.pythonhosted.org/packages/a9/95/a3dbbb5028f35eafb79008e7522a75244477d2838f38cbb722248dabc2a8/',
-               'https://files.pythonhosted.org/packages/84/a4/d9da2989a3d937e94616ef07f0630c507f6baa77ad37f94ceb06b36cacc1/python3-wget-0.0.2-beta1.tar.gz',
-               'https://files.pythonhosted.org/packages/6a/ef/6e3736663ee67369f7f5b697674bfbd3efc91e7096ddd4452bbbc80065ff/hypothesis-6.124.7.tar.gz',
-               'https://files.pythonhosted.org/packages/03/c6/14a17e10813b8db20d1e800ff9a3a898e65d25f2b0e9d6a94616f1e3362c/numpy-1.23.0.tar.gz',
-               'https://files.pythonhosted.org/packages/f6/d8/ab692a75f584d13c6542c3994f75def5bce52ded9399f52e230fe402819d/numpy-1.22.4.zip',
-               'https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz',
-               'https://files.pythonhosted.org/packages/64/33/60135848598c076ce4b231e1b1895170f45fbcaeaa2c9d5e38b04db70c35/joblib-1.4.2.tar.gz',
-               'https://files.pythonhosted.org/packages/bd/55/b5148dcbf72f5cde221f8bfe3b6a540da7aa1842f6b491ad979a6c8b84af/threadpoolctl-3.5.0.tar.gz',
-               'https://files.pythonhosted.org/packages/9e/a5/4ae3b3a0755f7b35a280ac90b28817d1f380318973cff14075ab41ef50d9/scikit_learn-1.6.1.tar.gz',
-               'https://files.pythonhosted.org/packages/ef/e5/c09d20723bfd91315f6f4ddc77912b0dcc09588b4ca7ad2ffa204607ad7f/scikit-learn-1.4.2.tar.gz',
-               'https://files.pythonhosted.org/packages/ee/5e/16e17bedcf54d5b618dc0771690deda77178e5c310402881c3d2d6c5f27c/hurry.filesize-0.9.tar.gz']
-
-
-## caution download genieclust here, not pypi, they differ and pypi's it's not installable!
-## cd /home/imallona/.local/easybuild/sources/c/clustbench/extensions/
-## wget wget https://github.com/gagolews/genieclust/archive/refs/tags/v1.1.6.tar.gz -O genieclust-1.1.6.tar.gz
-## todo automate this within the easyconfig!
-
 exts_list = [
     ('natsort', '8.4.0', {
         'checksums': ['45312c4a0e5507593da193dedd04abb1469253b601ecaf63445ad80f0a1ea581'],
     }),
-    ('cython', '3.0.11', {
-        'checksums': ['7146dd2af8682b4ca61331851e6aebce9fe5158e75300343f80c07ca80b1faff'],
-    }),
     ('hypothesis', '6.124.7', {
         'checksums': ['8ed6c6ae47e7d26d869c1dc3dee04e8fc50c95240715bb9915ded88d6d920f0e'],
     }),
-    ('numpy', '1.26.4', {
-        'checksums': ['2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010'],
-    }),
     ('fastcluster', '1.2.6', {
         'checksums': ['aab886efa7b6bba7ac124f4498153d053e5a08b822d2254926b7206cdf5a8aa6'],
     }),
-    ('genieclust', '1.1.6', {
-        'checksums': ['fb5b4ff68eef9e73496afa5949e726c8522c72e51f092716a6a598b03d5c09d6'],
-    }),
     ('hurry.filesize', '0.9', {
         'checksums': ['f5368329adbef86accd3bc9490522340bb79260455ae89b1a42c10f63801b9a6'],
     }),
     ('python3-wget', '0.0.2-beta1', {
         'modulename': 'wget',
+        'source_urls': ['https://files.pythonhosted.org/packages/84/a4/d9da2989a3d937e94616ef07f0630c507f6baa77ad37f94ceb06b36cacc1/'],
         'checksums': ['bbe7f44b3c28c4f7126aff20e8a438e78f6e4f1878d8b0c4940e87363813c17d'],
     }),
-    ('clustering_benchmarks', '1.1.5', {
-        'modulename': 'clustbench',
-        'checksums': ['1732c262fb13be2f88814ef9a19c60108e91a7f6cfb9b960a42feaa299034ea3'],
+    ('genieclust', '1.1.6', {
+        'download_dep_fail': False,
+        'install_src': 'https://files.pythonhosted.org/packages/2a/09/d1fd7b02cfabe76262d0f88d74fa71dc93e857525f8249539ec5ab174292/genieclust-1.1.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl',
+        'checksums': ['4c159f507b84b6d6d171883223648d837c520a9bcce650944a6ee0cb320e2151'],
     }),
 ]
 
-sanity_check_paths = {
-    'files': [],
-    'dirs': ['lib/python3.11/site-packages/clustbench/']
-}
-
 moduleclass = 'bio'
 
 
diff --git a/envs/fcps.eb b/envs/fcps.eb
index ee3db52..54c8c7d 100644
--- a/envs/fcps.eb
+++ b/envs/fcps.eb
@@ -1,6 +1,3 @@
-## largely as in https://github.com/easybuilders/easybuild-easyconfigs/commit/e9a36171c68414f933ab1afa03b32422491f0f96#diff-3f2a92ab6ab59ddaccf4bc61b59bdd3f6717b95fd019131a57f51eefc831a699
-## Caution boost easyconfig needs update  https://raw.githubusercontent.com/easybuilders/easybuild-easyconfigs/refs/heads/develop/easybuild/easyconfigs/b/Boost/Boost-1.82.0-GCC-12.3.0.eb (https://github.com/easybuilders/easybuild-easyconfigs/commit/e29210626f076e3a207f1abf3759ea124e28f8b2)
-
 easyblock = 'Bundle'
 
 name = 'fcps'
@@ -23,10 +20,7 @@ dependencies = [
 
 exts_default_options = {
     'source_urls': [
-        'https://bioconductor.org/packages/3.18/bioc/src/contrib/',
-        'https://bioconductor.org/packages/3.18/bioc/src/contrib/Archive/%(name)s',
-        'https://bioconductor.org/packages/3.18/data/annotation/src/contrib/',
-        'https://bioconductor.org/packages/3.18/data/experiment/src/contrib/',
+	'https://bioconductor.org/packages/release/bioc/src/contrib/',
         'https://cran.r-project.org/src/contrib/Archive/%(name)s',  # package archive
         'https://cran.r-project.org/src/contrib/',  # current version of packages
         'https://cran.freestatistics.org/src/contrib',  # mirror alternative for current packages
@@ -192,13 +186,15 @@ exts_list = [
     ('cluster', '2.1.8', {
         'checksums': ['c32a462e34694c99d58da953efa74882b5427f8c5db7cb226ae15c54ce6060ca'],
     }),
-    ('graph', '1.84.1', {
-        'checksums': ['cd2a91c93c81c09d9c59853c417e8a9cdde39b0589bacdce4ca916b6ee5f45a7'],
+    ('graph', '1.86.0', {
+        'checksums': ['ac9e196dfcb43848a851ea2d339cff41f8f16c7e80e76282c8fe7b822df8f367'],
     }),
     ('mclust', '6.1.1', {
         'checksums': ['ddd7018e5e6ea7f92c7fc9872b391491b7e91c2cd89ef1dcaf4408afb5116775'],
     }),
-    ('cclust', '0.6-26'),
+    ('cclust', '0.6-26', {
+        'checksums': ['92ec3c55a1864e4e1a4706bfdef8ad00727c720213ac656c718e867286b29857'],
+    }),
     ('flowClust', '3.40.0', {
         'installopts': "--configure-args='--with-gsl=${EBROOTGSL} --enable-bundled-gsl=false'",
         'checksums': ['7e699b06e378e32144704dbec18289109980b0f5eca166180f2c30007b83e0f5'],
@@ -240,4 +236,4 @@ sanity_check_paths = {
     'dirs': ['FCPS', 'dbscan', 'energy', 'protoclust'],
 }
 
-moduleclass = 'bio'
\ No newline at end of file
+moduleclass = 'bio'

From 931389f796ef8ceb7e4951c80c708e1b2c2129b1 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Thu, 8 May 2025 12:38:02 +0200
Subject: [PATCH 09/60] remove remote storage

---
 Clustering.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Clustering.yaml b/Clustering.yaml
index 0007ea5..689be2c 100644
--- a/Clustering.yaml
+++ b/Clustering.yaml
@@ -2,10 +2,10 @@ id: clustering_example
 description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
 version: 1.2
 benchmarker: "Izaskun Mallona, Daniel Incicau"
-storage: https://play.min.io
 benchmark_yaml_spec: 0.04
-storage_api: S3
-storage_bucket_name: clustering_example
+# storage: https://play.min.io
+# storage_api: S3
+# storage_bucket_name: clustering_example
 software_backend: conda
 software_environments:
   clustbench:

From 60ac47b3c55bec65b5ad839d524a7b8cd87b1b4c Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Thu, 8 May 2025 12:41:44 +0200
Subject: [PATCH 10/60] do not run artifact if not in main repo

---
 .github/workflows/benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 500eb58..2a55846 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -67,7 +67,7 @@ jobs:
     name: Benchmark Artifact
     runs-on: ubuntu-latest
     needs: run-benchmark
-    if: always()
+    if: github.ref == 'refs/heads/main' && github.repository_owner == 'omnibenchmark'
     steps:
       - name: Check out repository
         uses: actions/checkout@v4

From 1b972bfef0d7a74199d0289d8b7b8749720bce27 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Thu, 8 May 2025 12:45:12 +0200
Subject: [PATCH 11/60] Makefile: add run_with_envmodules_backend target

---
 Makefile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index e107f62..875a375 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 MAX_CORES ?= 10
-# by default, we want to run all snakemake rules even if there are failures
+# by default, we want to run all snakemake rules even if there are failures (-k)
 OB_CMD=ob run benchmark -k --local --task-timeout "4h" --cores ${MAX_CORES}
 prepare_apptainer_env:
 	cd envs && ./build_singularity.sh
@@ -12,3 +12,6 @@ run_with_apptainer_backend:
 run_with_conda_backend:
 	 ${OB_CMD} -b Clustering_conda.yml
 	 mv out out_conda
+run_with_envmodules_backend:
+	 ${OB_CMD} -b Clustering_envmodules.yml
+	 mv out out_lmod

From 49646db648dee014b3a43f655ef64147cbda6ed0 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Thu, 8 May 2025 13:22:56 +0200
Subject: [PATCH 12/60] streamline envmodules yaml

---
 Clustering_envmodules.yml | 281 ++++++++++++++++++++------------------
 1 file changed, 149 insertions(+), 132 deletions(-)

diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml
index 3c2b8bd..1ab4808 100644
--- a/Clustering_envmodules.yml
+++ b/Clustering_envmodules.yml
@@ -2,32 +2,21 @@ id: clustering_example_envmodules
 description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
 version: 1.4
 benchmarker: "Izaskun Mallona, Daniel Incicau"
-storage: http://omnibenchmark.org:9000
-benchmark_yaml_spec: 0.04
-storage_api: S3
-storage_bucket_name: clusteringexampleenvmodules
+benchmark_yaml_spec: 0.5
+
 software_backend: envmodules
+
 software_environments:
   clustbench:
     description: "clustbench on py3.12.6"
-    conda: envs/clustbench.yml
-    envmodule: clustbench
-    apptainer: envs/clustbench.sif
-  sklearn:
-    description: "Daniel's on py3.12.6"
-    conda: envs/sklearn.yml
-    apptainer: envs/sklearn.sif
-    envmodule: clustbench # not true, but
-  R:
-    description: "Daniel's R with readr, dplyr, mclust, caret"
-    conda: envs/r.yml
-    apptainer: envs/r.sif
-    envmodule: fcps # not true, but
+    envmodule: clustbench/0.1.0-foss-2023b # py3.11 tho
+    conda: na
+    apptainer: na
   rmarkdown:
     description: "R with some plotting dependencies"
-    conda: envs/rmarkdown.yml
-    apptainer: envs/r.sif # not true, but
-    envmodule: fcps # not true, but
+    envmodule: rmarkdown # TODO
+    conda: na
+    apptainer: na
   fcps:
     description: "CRAN's FCPS"
     conda: envs/fcps.yml
@@ -56,42 +45,132 @@ stages:
         repository:
           url: https://github.com/imallona/clustbench_data
           commit: 366c5a2
-        parameters:  # comments depict the possible cardinalities and the number of curated labelsets
+        parameters: # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
-          - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] #	2	1
-          - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] #	2	2
+          - values: [
+                "--dataset_generator",
+                "fcps",
+                "--dataset_name",
+                "chainlink",
+              ] #	2	1
+          - values: [
+                "--dataset_generator",
+                "fcps",
+                "--dataset_name",
+                "engytime",
+              ] #	2	2
           - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] #	7	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] #	3	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] #	2, 6	2
           - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] #	4	1
-          - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] #	2	1
+          - values: [
+                "--dataset_generator",
+                "fcps",
+                "--dataset_name",
+                "twodiamonds",
+              ] #	2	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] #	2	1
           - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] #	2	1
-          - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] #	2, 4, 5	6
+          - values: [
+                "--dataset_generator",
+                "graves",
+                "--dataset_name",
+                "fuzzyx",
+              ] #	2, 4, 5	6
           - values: ["--dataset_generator", "graves", "--dataset_name", "line"] #	2	1
-          - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] #	2, 4	2
+          - values: [
+                "--dataset_generator",
+                "graves",
+                "--dataset_name",
+                "parabolic",
+              ] #	2, 4	2
           - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] #	2	1
-          - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] #	2	1
-          - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] #	2, 5	2
-          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] #	3, 5	2
-          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] #	3, 5	2
-          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] #	3, 5	2
-          - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] #	6	1
-          - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] #	6	1
-          - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] #	6	1
+          - values: [
+                "--dataset_generator",
+                "graves",
+                "--dataset_name",
+                "ring_noisy",
+              ] #	2	1
+          - values: [
+                "--dataset_generator",
+                "graves",
+                "--dataset_name",
+                "ring_outliers",
+              ] #	2, 5	2
+          - values: [
+                "--dataset_generator",
+                "graves",
+                "--dataset_name",
+                "zigzag",
+              ] #	3, 5	2
+          - values: [
+                "--dataset_generator",
+                "graves",
+                "--dataset_name",
+                "zigzag_noisy",
+              ] #	3, 5	2
+          - values: [
+                "--dataset_generator",
+                "graves",
+                "--dataset_name",
+                "zigzag_outliers",
+              ] #	3, 5	2
+          - values: [
+                "--dataset_generator",
+                "other",
+                "--dataset_name",
+                "chameleon_t4_8k",
+              ] #	6	1
+          - values: [
+                "--dataset_generator",
+                "other",
+                "--dataset_name",
+                "chameleon_t5_8k",
+              ] #	6	1
+          - values: [
+                "--dataset_generator",
+                "other",
+                "--dataset_name",
+                "hdbscan",
+              ] #	6	1
           - values: ["--dataset_generator", "other", "--dataset_name", "iris"] #	3	1
           - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] #	3	1
           - values: ["--dataset_generator", "other", "--dataset_name", "square"] #	2	1
-          - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] #	7	1
-          - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] #	4, 5, 6	5
+          - values: [
+                "--dataset_generator",
+                "sipu",
+                "--dataset_name",
+                "aggregation",
+              ] #	7	1
+          - values: [
+                "--dataset_generator",
+                "sipu",
+                "--dataset_name",
+                "compound",
+              ] #	4, 5, 6	5
           - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] #	2	2
           - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] #	2	1
-          - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] #	3, 4	2
+          - values: [
+                "--dataset_generator",
+                "sipu",
+                "--dataset_name",
+                "pathbased",
+              ] #	3, 4	2
           - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] #	8, 9, 15	3
           - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] #	3	1
-          - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] #	8	1
+          - values: [
+                "--dataset_generator",
+                "sipu",
+                "--dataset_name",
+                "unbalance",
+              ] #	8	1
           - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] #	8	1
-          - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] #	2	1
+          - values: [
+                "--dataset_generator",
+                "uci",
+                "--dataset_name",
+                "ionosphere",
+              ] #	2	1
           - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] #	2	1
           - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] #	7	1
           - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] #	2	1
@@ -100,8 +179,18 @@ stages:
           - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] #	4	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] #	4	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] #	10	1
-          - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] #	3	1
-          - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] #	6	1
+          - values: [
+                "--dataset_generator",
+                "wut",
+                "--dataset_name",
+                "isolation",
+              ] #	3	1
+          - values: [
+                "--dataset_generator",
+                "wut",
+                "--dataset_name",
+                "labirynth",
+              ] #	6	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] #	3	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] #	2	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] #	3	1
@@ -109,9 +198,24 @@ stages:
           - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] #	5	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] #	4, 6	2
           - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] #	2	1
-          - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] #	4	1
-          - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] #	3	1
-          - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] #	2	1
+          - values: [
+                "--dataset_generator",
+                "wut",
+                "--dataset_name",
+                "trajectories",
+              ] #	4	1
+          - values: [
+                "--dataset_generator",
+                "wut",
+                "--dataset_name",
+                "trapped_lovers",
+              ] #	3	1
+          - values: [
+                "--dataset_generator",
+                "wut",
+                "--dataset_name",
+                "twosplashes",
+              ] #	2	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] #	5	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] #	3	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] #	3	1
@@ -126,7 +230,7 @@ stages:
         path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
 
   ## clustbench methods (fastcluster) ###################################################################
-  
+
   - id: clustering
     modules:
       - id: fastcluster
@@ -148,7 +252,6 @@ stages:
         software_environment: "clustbench"
         repository:
           url: https://github.com/imallona/clustbench_sklearn
-          #url: /home/imallona/src/clustbench_sklearn
           commit: 5877378
         parameters:
           - values: ["--method", "birch"]
@@ -229,89 +332,3 @@ stages:
     outputs:
       - id: metrics.scores
         path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
-
-  # ## daniel's data ###########################################################################
-  
-  # - id: danielsdata
-  #   modules:
-  #     - id: iris_manual
-  #       name: "Iris Dataset"
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/iris.git
-  #         commit: 47c63f0
-  #     - id: penguins
-  #       name: "Penguins Dataset"
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/penguins.git
-  #         commit: 9032478
-  #   outputs:
-  #     - id: data.features
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv"
-  #     - id: data.labels
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv"
-  
-  # ## daniel's distances ########################################################################
-  
-  # - id: distances
-  #   modules:
-  #     - id: D1
-  #       software_environment: "sklearn"
-  #       parameters:
-  #         - values: ["--measure", "cosine"]
-  #         - values: ["--measure", "euclidean"]
-  #         - values: ["--measure", "manhattan"]
-  #         - values: ["--measure", "chebyshev"]
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/distance.git
-  #         commit: dd99d4f
-  #   inputs:
-  #     - entries:
-  #         - data.features
-  #   outputs:
-  #     - id: distances
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv"
-        
-  # ## daniel's methods ###################################################################
-  
-  # - id: danielmethods
-  #   modules:
-  #     - id: kmeans
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/kmeans.git
-  #         commit: 049c8b1
-  #     - id: ward
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/ward.git
-  #         commit: 976e3f3
-  #   inputs:
-  #     - entries:
-  #         - distances
-  #   outputs:
-  #     - id: methods.clusters
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv"
-
-  # ## daniel's metrics ###################################################################
-
-  # - id: danielsmetrics
-  #   modules:
-  #     - id: ari
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/ari.git
-  #         commit: 72708f0
-  #     - id: accuracy
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/accuracy.git
-  #         commit: e26b32f
-  #   inputs:
-  #     - entries:
-  #         - methods.clusters
-  #         - data.labels
-  #   outputs:
-  #     - id: metrics.mapping
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt"

From fc53991d1eb32c7749c3f1a2bccc0ed9e33601af Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Thu, 8 May 2025 15:14:35 +0200
Subject: [PATCH 13/60] update clustbench

---
 Clustering_envmodules.yml | 38 ++++++++++++++++++++------------------
 envs/clustbench.eb        |  5 +++++
 2 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml
index 1ab4808..f37fd6c 100644
--- a/Clustering_envmodules.yml
+++ b/Clustering_envmodules.yml
@@ -9,23 +9,24 @@ software_backend: envmodules
 software_environments:
   clustbench:
     description: "clustbench on py3.12.6"
-    envmodule: clustbench/0.1.0-foss-2023b # py3.11 tho
-    conda: na
+    envmodule: clustbench/0.1.0-foss-2023b
+    conda: envs/clustbench.yml
+    apptainer: na
+  fcps:
+    description: "CRAN's FCPS"
+    envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
+    conda: envs/fcps.yml
     apptainer: na
   rmarkdown:
     description: "R with some plotting dependencies"
     envmodule: rmarkdown # TODO
-    conda: na
+    conda: envs/clustbench.yml
     apptainer: na
-  fcps:
-    description: "CRAN's FCPS"
-    conda: envs/fcps.yml
-    apptainer: envs/fcps.sif
-    envmodule: fcps
+
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
-    software_environment: "rmarkdown"
+    software_environment: rmarkdown
     repository:
       url: https://github.com/imallona/clustering_report
       commit: 1d6bdf5
@@ -34,14 +35,15 @@ metric_collectors:
     outputs:
       - id: plotting.html
         path: "{input}/{name}/plotting_report.html"
+
 stages:
-  ## clustbench data ##########################################################
 
   - id: data
+    ## clustbench data
     modules:
       - id: clustbench
         name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_data
           commit: 366c5a2
@@ -235,7 +237,7 @@ stages:
     modules:
       - id: fastcluster
         name: "fastcluster algorithm"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_fastcluster
           # url: /home/imallona/src/clustbench_fastcluster/
@@ -249,7 +251,7 @@ stages:
           - values: ["--linkage", "centroid"]
       - id: sklearn
         name: "sklearn"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_sklearn
           commit: 5877378
@@ -260,7 +262,7 @@ stages:
           - values: ["--method", "gm"]
       - id: agglomerative
         name: "agglomerative"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_agglomerative
           commit: 5454368
@@ -270,7 +272,7 @@ stages:
           - values: ["--linkage", "ward"]
       - id: genieclust
         name: "genieclust"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_genieclust
           commit: 6090043
@@ -280,7 +282,7 @@ stages:
           - values: ["--method", "ica"]
       - id: fcps
         name: "fcps"
-        software_environment: "fcps"
+        software_environment: fcps
         repository:
           url: https://github.com/imallona/clustbench_fcps
           commit: 272fa5f
@@ -309,10 +311,10 @@ stages:
     modules:
       - id: partition_metrics
         name: "clustbench partition metrics"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_metrics
-          commit: 9132d45
+          commit: 8184cd4
         parameters:
           - values: ["--metric", "normalized_clustering_accuracy"]
           - values: ["--metric", "adjusted_fm_score"]
diff --git a/envs/clustbench.eb b/envs/clustbench.eb
index f3ee681..0e86911 100644
--- a/envs/clustbench.eb
+++ b/envs/clustbench.eb
@@ -13,6 +13,7 @@ dependencies = [
     ('SciPy-bundle', '2023.11'),
     ('matplotlib', '3.8.2'),
     ('scikit-learn', '1.4.0'),
+# FIXME: I think this is not needed -- ben
 #    ('meson-python', '0.15.0'),
 #    ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b
 ]
@@ -40,6 +41,10 @@ exts_list = [
         'install_src': 'https://files.pythonhosted.org/packages/2a/09/d1fd7b02cfabe76262d0f88d74fa71dc93e857525f8249539ec5ab174292/genieclust-1.1.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl',
         'checksums': ['4c159f507b84b6d6d171883223648d837c520a9bcce650944a6ee0cb320e2151'],
     }),
+    ('clustering_benchmarks', '1.1.6', {
+        'modulename': 'clustbench',
+        'checksums': ['8c3ac0aed7c4c4925df6e5000db29aed6359341bd1ef2e516f230e13d8b66a0c'],
+    }),
 ]
 
 moduleclass = 'bio'

From 54b72790b1e2d2d9aa66d30c3d956b5d8be387a3 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sat, 10 May 2025 18:46:41 +0200
Subject: [PATCH 14/60] add rmarkdown-python bundles, without checksums

---
 envs/rmarkdown-python.eb | 28 ++++++++++++
 envs/rmarkdown.eb        | 94 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+)
 create mode 100644 envs/rmarkdown-python.eb
 create mode 100644 envs/rmarkdown.eb

diff --git a/envs/rmarkdown-python.eb b/envs/rmarkdown-python.eb
new file mode 100644
index 0000000..a9edb00
--- /dev/null
+++ b/envs/rmarkdown-python.eb
@@ -0,0 +1,28 @@
+easyblock = 'Bundle'
+
+# This is a dummy bundle that installs:
+# 1. rmarkdown: an R bundle that we also package
+# 2. Python-3.12.3-GCCcore-13.3.0
+# This is a dependency for the clustering_benchmark metric collector.
+
+name = 'rmarkdown-python'
+version = '0.1.0'
+
+local_rver = '4.4.2'
+local_pyver = '3.12.3'
+versionsuffix = f'-r-{local_rver}-py-{local_pyver}'
+
+homepage = 'https://omnibenchmark.org'
+description = 'Rmarkdown bundle with specific Python dependency'
+
+toolchain = {'name': 'system', 'version': '1.0'}
+
+dependencies = [
+    ('rmarkdown', '0.1.0', f'-gfbf-2024a-r-{local_rver}'),
+    ('Python', local_pyver, '-GCCcore-13.3.0'),
+]
+
+sanity_check_paths = {
+    'files': [],
+    'dirs': ['../../rmarkdown/0.1.0-gfbf-2024a-r-4.4.2']
+}
diff --git a/envs/rmarkdown.eb b/envs/rmarkdown.eb
new file mode 100644
index 0000000..a88a2a9
--- /dev/null
+++ b/envs/rmarkdown.eb
@@ -0,0 +1,94 @@
+easyblock = 'Bundle'
+
+# TODO(ben): Try to use https://www.eessi.io/docs/available_software/detail/R-bundle-CRAN/
+# and build only what's left out.
+
+name = 'rmarkdown'
+version = '0.1.0'
+versionsuffix = '-r-%(rver)s'
+
+homepage = 'https://omnibenchmark.org'
+description = 'rmarkdown bundle for clustbench reports'
+
+toolchain = {'name': 'gfbf', 'version': '2024a'}
+
+dependencies = [
+    ('R', '4.4.2'),
+]
+
+exts_default_options = {
+    'source_urls': [
+        'https://cloud.r-project.org/src/contrib/',
+        'https://cran.r-project.org/src/contrib/',  				# current version of packages
+        'https://cran.r-project.org/src/contrib/Archive/%(name)s',  		# package archive
+        'https://www.bioconductor.org/packages/release/bioc/src/contrib/',      # bioconductor
+    ],
+    'sources': ['%(name)s_%(version)s.tar.gz'],
+}
+
+exts_defaultclass = 'RPackage'
+
+
+exts_list = [
+    ('rlang', '1.1.6'),
+    ('glue', '1.8.0'),
+    ('cli', '3.6.4'),
+    ('lifecycle', '1.0.4'),
+    ('vctrs', '0.6.5'),
+    ('utf8', '1.2.4'),
+    ('lattice', '0.22-5'),
+    ('pkgconfig', '2.0.3'),
+    ('pillar', '1.10.2'),
+    ('magrittr', '2.0.3'),
+    ('fansi', '1.0.6'),
+    ('viridisLite', '0.4.2'),
+    ('RColorBrewer', '1.1-3'),
+    ('R6', '2.6.1'),
+    ('labeling', '0.4.3'),
+    ('farver', '2.1.2'),
+    ('Matrix', '1.7-3'),
+    ('nlme', '3.1-168'),
+    ('withr', '3.0.2'),
+    ('tibble', '3.2.1'),
+    ('colorspace', '2.1-1'),
+    ('munsell', '0.5.1'),
+    ('scales', '1.3.0'),
+    ('mgcv', '1.9-1'),
+    ('MASS', '7.3-65'),
+    ('isoband', '0.2.7'),
+    ('gtable', '0.3.6'),
+    ('ggplot2', '3.5.2'),
+    ('findpython', '1.0.9', {}),
+    ('argparse', '2.2.5', {}),
+    ('rmarkdown', '2.29', {}),
+    ('generics', '0.1.3', {}),
+    ('tidyselect', '1.2.1', {}),
+    ('dplyr', '1.1.4', {}),
+    ('tidyr', '1.3.1', {}),
+    ('shape', '1.4.6.1', {}),
+    ('GlobalOptions', '0.1.2', {}),
+    ('circlize', '0.4.16', {}),
+    ('rjson', '0.2.23', {}),
+    ('GetoptLong', '1.0.5', {}),
+    ('cluster', '2.1.8.1', {}),
+    ('clue', '0.3-66', {}),
+    ('png', '0.1-8', {}),
+    ('BiocGenerics', '0.54.0', {}),
+    ('S4Vectors', '0.46.0', {}),
+    ('IRanges', '2.42.0'),
+    ('matrixStats', '1.5.0', {}),
+    ('iterators', '1.0.14', {}),
+    ('codetools', '0.2-20', {}),
+    ('foreach', '1.5.2', {}),
+    ('doParallel', '1.0.17', {}),
+    ('ComplexHeatmap', '2.24.0', {}),
+]
+
+modextrapaths = {'R_LIBS_SITE': ''}
+
+sanity_check_paths = {
+    'files': [],
+    'dirs': ['argparse', 'rmarkdown', 'ggplot2', 'tidyr', 'ComplexHeatmap'],
+}
+
+moduleclass = 'bio'

From 1b57e44585c688d6f5e8f5be4b38b039e73cab57 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sat, 10 May 2025 18:49:27 +0200
Subject: [PATCH 15/60] inject checksums to rmarkdown easyconfig

---
 envs/rmarkdown.eb | 209 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 156 insertions(+), 53 deletions(-)

diff --git a/envs/rmarkdown.eb b/envs/rmarkdown.eb
index a88a2a9..067eadd 100644
--- a/envs/rmarkdown.eb
+++ b/envs/rmarkdown.eb
@@ -28,60 +28,163 @@ exts_default_options = {
 
 exts_defaultclass = 'RPackage'
 
-
 exts_list = [
-    ('rlang', '1.1.6'),
-    ('glue', '1.8.0'),
-    ('cli', '3.6.4'),
-    ('lifecycle', '1.0.4'),
-    ('vctrs', '0.6.5'),
-    ('utf8', '1.2.4'),
-    ('lattice', '0.22-5'),
-    ('pkgconfig', '2.0.3'),
-    ('pillar', '1.10.2'),
-    ('magrittr', '2.0.3'),
-    ('fansi', '1.0.6'),
-    ('viridisLite', '0.4.2'),
-    ('RColorBrewer', '1.1-3'),
-    ('R6', '2.6.1'),
-    ('labeling', '0.4.3'),
-    ('farver', '2.1.2'),
-    ('Matrix', '1.7-3'),
-    ('nlme', '3.1-168'),
-    ('withr', '3.0.2'),
-    ('tibble', '3.2.1'),
-    ('colorspace', '2.1-1'),
-    ('munsell', '0.5.1'),
-    ('scales', '1.3.0'),
-    ('mgcv', '1.9-1'),
-    ('MASS', '7.3-65'),
-    ('isoband', '0.2.7'),
-    ('gtable', '0.3.6'),
-    ('ggplot2', '3.5.2'),
-    ('findpython', '1.0.9', {}),
-    ('argparse', '2.2.5', {}),
-    ('rmarkdown', '2.29', {}),
-    ('generics', '0.1.3', {}),
-    ('tidyselect', '1.2.1', {}),
-    ('dplyr', '1.1.4', {}),
-    ('tidyr', '1.3.1', {}),
-    ('shape', '1.4.6.1', {}),
-    ('GlobalOptions', '0.1.2', {}),
-    ('circlize', '0.4.16', {}),
-    ('rjson', '0.2.23', {}),
-    ('GetoptLong', '1.0.5', {}),
-    ('cluster', '2.1.8.1', {}),
-    ('clue', '0.3-66', {}),
-    ('png', '0.1-8', {}),
-    ('BiocGenerics', '0.54.0', {}),
-    ('S4Vectors', '0.46.0', {}),
-    ('IRanges', '2.42.0'),
-    ('matrixStats', '1.5.0', {}),
-    ('iterators', '1.0.14', {}),
-    ('codetools', '0.2-20', {}),
-    ('foreach', '1.5.2', {}),
-    ('doParallel', '1.0.17', {}),
-    ('ComplexHeatmap', '2.24.0', {}),
+    ('rlang', '1.1.6', {
+        'checksums': ['18544c876f4e18ec554edecc308362a52fbc7e0805c4794cf59bcc4d0b57f330'],
+    }),
+    ('glue', '1.8.0', {
+        'checksums': ['c86f364ba899b8662f5da3e1a75f43ae081ab04e0d51171d052356e7ee4b72a0'],
+    }),
+    ('cli', '3.6.4', {
+        'checksums': ['0c39539ce173bcbf7abaca64e8d2c87ffec8257c144c31b793c4cf2dd9cf7620'],
+    }),
+    ('lifecycle', '1.0.4', {
+        'checksums': ['ada4d3c7e84b0c93105e888647c5754219a8334f6e1f82d5afaf83d4855b91cc'],
+    }),
+    ('vctrs', '0.6.5', {
+        'checksums': ['43167d2248fd699594044b5c8f1dbb7ed163f2d64761e08ba805b04e7ec8e402'],
+    }),
+    ('utf8', '1.2.4', {
+        'checksums': ['418f824bbd9cd868d2d8a0d4345545c62151d321224cdffca8b1ffd98a167b7d'],
+    }),
+    ('lattice', '0.22-5', {
+        'checksums': ['ba1fbe5e18a133507dca9851b7f933002bdb6d1f3ea5f410a0a441103b6da5f1'],
+    }),
+    ('pkgconfig', '2.0.3', {
+        'checksums': ['330fef440ffeb842a7dcfffc8303743f1feae83e8d6131078b5a44ff11bc3850'],
+    }),
+    ('pillar', '1.10.2', {
+        'checksums': ['2cdbe3fe1b28b62530880ab26fc3c874e0dd5060767ae1a8ee5685f65e56d645'],
+    }),
+    ('magrittr', '2.0.3', {
+        'checksums': ['a2bff83f792a1acb801bfe6330bb62724c74d5308832f2cb6a6178336ace55d2'],
+    }),
+    ('fansi', '1.0.6', {
+        'checksums': ['ea9dc690dfe50a7fad7c5eb863c157d70385512173574c56f4253b6dfe431863'],
+    }),
+    ('viridisLite', '0.4.2', {
+        'checksums': ['893f111d31deccd2cc959bc9db7ba2ce9020a2dd1b9c1c009587e449c4cce1a1'],
+    }),
+    ('RColorBrewer', '1.1-3', {
+        'checksums': ['4f42f5423c45688b39f492c7892d93f37b4541831c8ffb140364d2bd89031ac0'],
+    }),
+    ('R6', '2.6.1', {
+        'checksums': ['59c6eba8b1b912eb7e104f65053235604be853425ee67c152ac4e86a1f2073b4'],
+    }),
+    ('labeling', '0.4.3', {
+        'checksums': ['c62f4fc2cc74377d7055903c5f1913b7295f7587456fe468592738a483e264f2'],
+    }),
+    ('farver', '2.1.2', {
+        'checksums': ['528823b95daab4566137711f1c842027a952bea1b2ae6ff098e2ca512b17fe25'],
+    }),
+    ('Matrix', '1.7-3', {
+        'checksums': ['6642e9db8cddf32a051972fd5a634bf7edbdc925c5c2d139bf71e92df00fb44e'],
+    }),
+    ('nlme', '3.1-168', {
+        'checksums': ['23b78468344cb6775dee5e0d9c8133032d64f08ebaba20776508a0443a897362'],
+    }),
+    ('withr', '3.0.2', {
+        'checksums': ['0a3a05f493d275cca4bf13c8c1b95a1a4eed7f83b2493f41fde02ce3fc92c1a3'],
+    }),
+    ('tibble', '3.2.1', {
+        'checksums': ['65a72d0c557fd6e7c510d150c935ed6ced5db7d05fc20236b370f11428372131'],
+    }),
+    ('colorspace', '2.1-1', {
+        'checksums': ['e721cee5f4d6e4b0fc8eb18265e316b4f856fd3be02f0775a26032663758cd0b'],
+    }),
+    ('munsell', '0.5.1', {
+        'checksums': ['03a2fd9ac40766cded96dfe33b143d872d0aaa262a25482ce19161ca959429a6'],
+    }),
+    ('scales', '1.3.0', {
+        'checksums': ['b33e0f6b44259551ce02befd52eac53602509fbfdd903920620c658c50f35888'],
+    }),
+    ('mgcv', '1.9-1', {
+        'checksums': ['700fbc37bedd3a49505b9bc4949faee156d9cfb4f669d797d06a10a15a5bdb32'],
+    }),
+    ('MASS', '7.3-65', {
+        'checksums': ['b07ef1e3c364ce56269b4a8a7759cc9f87c876554f91293437bb578cfe38172f'],
+    }),
+    ('isoband', '0.2.7', {
+        'checksums': ['7693223343b45b86de2b5b638ff148f0dafa6d7b1237e822c5272902f79cdf61'],
+    }),
+    ('gtable', '0.3.6', {
+        'checksums': ['d305a5fa11278b649d2d8edc5288bf28009be888a42be58ff8714018e49de0ef'],
+    }),
+    ('ggplot2', '3.5.2', {
+        'checksums': ['0a30024a2ff3e569412223c8f14563ed504f3e0851de03e42d1b5f73fe1f06bf'],
+    }),
+    ('findpython', '1.0.9', {
+        'checksums': ['b6a15e0cdfcdd4b1cfc76f7e4eaad0125d4d52889711200075280e9b2a2cb7cb'],
+    }),
+    ('argparse', '2.2.5', {
+        'checksums': ['53c8a9eb51041084eb3d9c271b14ebcb32dc2f50cf16afa5c54c504a97229ea4'],
+    }),
+    (name, '2.29', {
+        'checksums': ['6662ac85316c869caad6e3b95468cad97f6eef106d47b066db8d40c05a490928'],
+    }),
+    ('generics', '0.1.3', {
+        'checksums': ['75046163bfa8b8a4f4214c1b689e796207f6447182f2e5062cf570302387d053'],
+    }),
+    ('tidyselect', '1.2.1', {
+        'checksums': ['169e97ba0bbfbcdf4a80534322751f87a04370310c40e27f04aac6525d45903c'],
+    }),
+    ('dplyr', '1.1.4', {
+        'checksums': ['cf730414d5d4ab387b4e9890a4b1df9d17a3903488e8da8df1cf2e11e44558cb'],
+    }),
+    ('tidyr', '1.3.1', {
+        'checksums': ['e820c261cb5543f572f49276a7bdc7302aa4215da4bf850b1b939a315353835d'],
+    }),
+    ('shape', '1.4.6.1', {
+        'checksums': ['43f9bd0f997fd6cf1838efd8b2509c9a6396513f4e54a20360481634affd22a4'],
+    }),
+    ('GlobalOptions', '0.1.2', {
+        'checksums': ['47890699668cfa9900a829c51f8a32e02a7a7764ad07cfac972aad66f839753e'],
+    }),
+    ('circlize', '0.4.16', {
+        'checksums': ['16dc32c7704906d13a9e5281bb396e92fb89a6b17fa5e201953240726b650b67'],
+    }),
+    ('rjson', '0.2.23', {
+        'checksums': ['55034575c854ed657e6701da278c0fdea251479624d06a963b2e58461a5f0f48'],
+    }),
+    ('GetoptLong', '1.0.5', {
+        'checksums': ['8c237986ed3dfb72d956ad865ef7768644eebf144675ad66140acfd1aca9d701'],
+    }),
+    ('cluster', '2.1.8.1', {
+        'checksums': ['4b95b78e09b17ddca72edc0bb180c753c004ed2f61c3eb12e0451ac77f441e57'],
+    }),
+    ('clue', '0.3-66', {
+        'checksums': ['aa86dd58c05635eb394c9ede0dd15a4f24af4815f299451bbc7895c0f737c2fb'],
+    }),
+    ('png', '0.1-8', {
+        'checksums': ['5a36fabb6d62ba2533d3fc4cececd07891942cfb76fe689ec0d550d08762f61c'],
+    }),
+    ('BiocGenerics', '0.54.0', {
+        'checksums': ['413d6f74cbc671147f63eefc46b718af815d6497535c2198925d9306e00c41b9'],
+    }),
+    ('S4Vectors', '0.46.0', {
+        'checksums': ['c34249c6a367a2a1e94158d9e60294f2b901e485d93717250a417569be187a40'],
+    }),
+    ('IRanges', '2.42.0', {
+        'checksums': ['0abb01ee93111c5fc678f9aa2f93d00d8d1548263cb60daa52645a6061b603fc'],
+    }),
+    ('matrixStats', '1.5.0', {
+        'checksums': ['12996c5f3e6fc202a43e1087f16a71b7fa93d7e908f512542c7ee89cf95dcc15'],
+    }),
+    ('iterators', '1.0.14', {
+        'checksums': ['cef3075a0930e1408c764e4da56bbadd4f7d14315809df8f38dd51f80ccc677b'],
+    }),
+    ('codetools', '0.2-20', {
+        'checksums': ['3be6f375ec178723ddfd559d1e8e85bfeee04a5fbaf9f53f2f844e1669fea863'],
+    }),
+    ('foreach', '1.5.2', {
+        'checksums': ['56338d8753f9f68f262cf532fd8a6d0fe25a71a2ff0107f3ce378feb926bafe4'],
+    }),
+    ('doParallel', '1.0.17', {
+        'checksums': ['b96a25ad105a654d70c7b4ca27290dc9967bc47f4668b2763927a886b178abd7'],
+    }),
+    ('ComplexHeatmap', '2.24.0', {
+        'checksums': ['2a015ad26c5a5f003ee203d77cc8d3eea5461bcf2db7ce102da1bef7db082650'],
+    }),
 ]
 
 modextrapaths = {'R_LIBS_SITE': ''}

From dfd5b936195655c136bf513640c5a5196a7785ea Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 12:18:59 +0200
Subject: [PATCH 16/60] update sklearn singularity definition

---
 envs/sklearn_singularity.def | 57 ++++++++++++++++++++++++------------
 1 file changed, 39 insertions(+), 18 deletions(-)

diff --git a/envs/sklearn_singularity.def b/envs/sklearn_singularity.def
index 939a3bb..56bcf37 100644
--- a/envs/sklearn_singularity.def
+++ b/envs/sklearn_singularity.def
@@ -1,33 +1,54 @@
-Bootstrap: docker 
-From: ubuntu:jammy-20240911.1
+Bootstrap: docker
+From: ubuntu:noble-20250404
 
 %labels
-
-    AUTHOR izaskun.mallona@gmail.com
+    Author izaskun.mallona@gmail.com
+    Author ben.uzh@proton.me
 
 %post
+    PYTHON_VERSION=3.12.6
+    PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. -f1,2)
     
-    # Install python3.12
+    # Update and enable deb-src
+    apt-get update
+    echo "deb-src http://archive.ubuntu.com/ubuntu/ noble main restricted universe multiverse" >> /etc/apt/sources.list
+    echo "deb-src http://archive.ubuntu.com/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list
     apt-get update
-    apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \
-        libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git
-
-    wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz
-    tar -xf Python-3.12.6.tgz
-    cd Python-3.12.*/
-    ./configure --enable-optimizations
-    make -j 4
-    make altinstall
 
-    # virtualenv
+    
+    # Get build dependencies for Python
+    apt-get build-dep -y python3
+
+    # Extra dependencies
+    apt-get install -y git python-is-python3 wget zlib1g-dev libbz2-dev libssl-dev libffi-dev
+    
+    # Calculate half the number of available cores
+    HALF_NPROC=$(( $(nproc) / 2 ))
+    # Ensure at least one core is used
+    CORES_TO_USE=$(( HALF_NPROC > 0 ? HALF_NPROC : 1 ))
+    
+    # Download and build Python with optimizations
+    wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz
+    tar -xf Python-${PYTHON_VERSION}.tgz
+    cd Python-${PYTHON_VERSION}*/
+    # Enable all possible optimizations
+    ./configure --enable-optimizations --with-lto --enable-shared LDFLAGS="-Wl,-rpath /usr/local/lib"
+    make -j ${CORES_TO_USE}
+    make altinstall
+    
+    # Create virtualenv using the locally built Python
     cd /opt
-    python3.12 -m venv "default"
+    /usr/local/bin/python${PYTHON_MAJOR_VERSION} -m venv "default"
     . default/bin/activate
-
-    pip3 install -U scikit-learn pandas argparse numpy scipy "isodate" "pydantic-core" "gitpython==3.1.43"
+    
+    # Install required packages
+    pip3 install "clustering-benchmarks==1.1.5" "wget" "fastcluster==1.2.6" "numpy==1.26.4" "scipy==1.14.1" \
+      "isodate" "pydantic-core"  \
+      "genieclust==1.1.6" "pandas==2.2.3" "gitpython==3.1.43"
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 
 %environment
 
     . /opt/default/bin/activate
+

From 0056b7fce71ab1e5efc9456502c2114ea4d597d7 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 12:25:21 +0200
Subject: [PATCH 17/60] factorize sklearn singularity pip block

---
 envs/sklearn-pip.apptainer.include | 11 +++++++++++
 envs/sklearn_singularity.def       | 19 +++++++++----------
 2 files changed, 20 insertions(+), 10 deletions(-)
 create mode 100644 envs/sklearn-pip.apptainer.include

diff --git a/envs/sklearn-pip.apptainer.include b/envs/sklearn-pip.apptainer.include
new file mode 100644
index 0000000..b8f48eb
--- /dev/null
+++ b/envs/sklearn-pip.apptainer.include
@@ -0,0 +1,11 @@
+    pip3 install \
+      "clustering-benchmarks==1.1.6" \
+      "fastcluster==1.2.6" \
+      "numpy==1.26.4" \
+      "scipy==1.14.1" \
+      "isodate" \
+      "pydantic-core"  \
+      "genieclust==1.1.6" \
+      "pandas==2.2.3" \
+      "gitpython==3.1.43" \
+      wget"
diff --git a/envs/sklearn_singularity.def b/envs/sklearn_singularity.def
index 56bcf37..cb9a2f6 100644
--- a/envs/sklearn_singularity.def
+++ b/envs/sklearn_singularity.def
@@ -8,25 +8,25 @@ From: ubuntu:noble-20250404
 %post
     PYTHON_VERSION=3.12.6
     PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. -f1,2)
-    
+
     # Update and enable deb-src
     apt-get update
     echo "deb-src http://archive.ubuntu.com/ubuntu/ noble main restricted universe multiverse" >> /etc/apt/sources.list
     echo "deb-src http://archive.ubuntu.com/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list
     apt-get update
 
-    
+
     # Get build dependencies for Python
     apt-get build-dep -y python3
 
     # Extra dependencies
     apt-get install -y git python-is-python3 wget zlib1g-dev libbz2-dev libssl-dev libffi-dev
-    
+
     # Calculate half the number of available cores
     HALF_NPROC=$(( $(nproc) / 2 ))
     # Ensure at least one core is used
     CORES_TO_USE=$(( HALF_NPROC > 0 ? HALF_NPROC : 1 ))
-    
+
     # Download and build Python with optimizations
     wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz
     tar -xf Python-${PYTHON_VERSION}.tgz
@@ -35,16 +35,15 @@ From: ubuntu:noble-20250404
     ./configure --enable-optimizations --with-lto --enable-shared LDFLAGS="-Wl,-rpath /usr/local/lib"
     make -j ${CORES_TO_USE}
     make altinstall
-    
+
     # Create virtualenv using the locally built Python
     cd /opt
     /usr/local/bin/python${PYTHON_MAJOR_VERSION} -m venv "default"
     . default/bin/activate
-    
-    # Install required packages
-    pip3 install "clustering-benchmarks==1.1.5" "wget" "fastcluster==1.2.6" "numpy==1.26.4" "scipy==1.14.1" \
-      "isodate" "pydantic-core"  \
-      "genieclust==1.1.6" "pandas==2.2.3" "gitpython==3.1.43"
+
+    # Install required packages with pip
+
+    % include sklearn-pip.apptainer.include
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 

From cef3a6b6f0c0c0cb564941dee77eb52e9fd207db Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 12:27:48 +0200
Subject: [PATCH 18/60] extract variable in build script

---
 envs/build_singularity.sh | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
index c0c3d93..c5cbf6f 100755
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -1,5 +1,7 @@
 #!/bin/sh
-singularity build sklearn.sif sklearn_singularity.def
-singularity build clustbench.sif clustbench_singularity.def
-singularity build r.sif r_singularity.def
-singularity build fcps.sif fcps_singularity.def
+CMD=singularity
+BUILD="build --fakeroot"
+$CMD $BUILD sklearn-optimized.sif sklearn_singularity.def
+$CMD $BUILD clustbench-optimized.sif clustbench_singularity.def
+$CMD $BUILD r.sif r_singularity.def
+$CMD $BUILD fcps.sif fcps_singularity.def

From 2ee17ca636501521efb2c650a0b76750052692ee Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 12:37:19 +0200
Subject: [PATCH 19/60] revert include, should use m4

---
 envs/build_singularity.sh                            |  2 +-
 envs/sklearn-pip.apptainer.include                   | 11 -----------
 ...ularity.def => sklearn_singularity_optimized.def} | 12 +++++++++++-
 3 files changed, 12 insertions(+), 13 deletions(-)
 delete mode 100644 envs/sklearn-pip.apptainer.include
 rename envs/{sklearn_singularity.def => sklearn_singularity_optimized.def} (85%)

diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
index c5cbf6f..61fbd13 100755
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -1,7 +1,7 @@
 #!/bin/sh
 CMD=singularity
 BUILD="build --fakeroot"
-$CMD $BUILD sklearn-optimized.sif sklearn_singularity.def
+$CMD $BUILD sklearn_optimized.sif sklearn_singularity_optimized.def
 $CMD $BUILD clustbench-optimized.sif clustbench_singularity.def
 $CMD $BUILD r.sif r_singularity.def
 $CMD $BUILD fcps.sif fcps_singularity.def
diff --git a/envs/sklearn-pip.apptainer.include b/envs/sklearn-pip.apptainer.include
deleted file mode 100644
index b8f48eb..0000000
--- a/envs/sklearn-pip.apptainer.include
+++ /dev/null
@@ -1,11 +0,0 @@
-    pip3 install \
-      "clustering-benchmarks==1.1.6" \
-      "fastcluster==1.2.6" \
-      "numpy==1.26.4" \
-      "scipy==1.14.1" \
-      "isodate" \
-      "pydantic-core"  \
-      "genieclust==1.1.6" \
-      "pandas==2.2.3" \
-      "gitpython==3.1.43" \
-      wget"
diff --git a/envs/sklearn_singularity.def b/envs/sklearn_singularity_optimized.def
similarity index 85%
rename from envs/sklearn_singularity.def
rename to envs/sklearn_singularity_optimized.def
index cb9a2f6..6d6e165 100644
--- a/envs/sklearn_singularity.def
+++ b/envs/sklearn_singularity_optimized.def
@@ -43,7 +43,17 @@ From: ubuntu:noble-20250404
 
     # Install required packages with pip
 
-    % include sklearn-pip.apptainer.include
+    pip3 install \
+      "clustering-benchmarks==1.1.6" \
+      "fastcluster==1.2.6" \
+      "numpy==1.26.4" \
+      "scipy==1.14.1" \
+      "isodate" \
+      "pydantic-core"  \
+      "genieclust==1.1.6" \
+      "pandas==2.2.3" \
+      "gitpython==3.1.43" \
+      "wget"
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 

From c4cbe5c2f22ed52a4873d5a14781682a19e87a4f Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 12:44:30 +0200
Subject: [PATCH 20/60] update python version

---
 envs/sklearn_singularity_optimized.def | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/envs/sklearn_singularity_optimized.def b/envs/sklearn_singularity_optimized.def
index 6d6e165..17a131d 100644
--- a/envs/sklearn_singularity_optimized.def
+++ b/envs/sklearn_singularity_optimized.def
@@ -6,7 +6,7 @@ From: ubuntu:noble-20250404
     Author ben.uzh@proton.me
 
 %post
-    PYTHON_VERSION=3.12.6
+    PYTHON_VERSION=3.12.9
     PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. -f1,2)
 
     # Update and enable deb-src

From 21bdd666d47d029d5463f814ab685389ec850f71 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 13:25:31 +0200
Subject: [PATCH 21/60] do a little bit of cleanup with the multiple envs

---
 ....yml => Clustering_apptainer_optimized.yml | 129 +++---------------
 Clustering_conda.yml                          | 125 +++--------------
 Clustering_envmodules.yml                     |  25 ++--
 envs/build_singularity.sh                     |   7 +-
 ...def => clustbench_apptainer_optimized.def} |  37 +++--
 envs/clustbench_apptainer_vanillapy.def       |  55 ++++++++
 envs/clustbench_singularity.def               |  35 -----
 ...ity.def => fcps_singularity_optimized.def} |  11 +-
 envs/sklearn.yml                              |  11 --
 9 files changed, 145 insertions(+), 290 deletions(-)
 rename Clustering_singularity.yml => Clustering_apptainer_optimized.yml (74%)
 rename envs/{sklearn_singularity_optimized.def => clustbench_apptainer_optimized.def} (71%)
 create mode 100644 envs/clustbench_apptainer_vanillapy.def
 delete mode 100644 envs/clustbench_singularity.def
 rename envs/{fcps_singularity.def => fcps_singularity_optimized.def} (79%)
 delete mode 100644 envs/sklearn.yml

diff --git a/Clustering_singularity.yml b/Clustering_apptainer_optimized.yml
similarity index 74%
rename from Clustering_singularity.yml
rename to Clustering_apptainer_optimized.yml
index c80b498..96e357e 100644
--- a/Clustering_singularity.yml
+++ b/Clustering_apptainer_optimized.yml
@@ -1,38 +1,32 @@
 id: clustering_example_apptainer
+
 description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
-version: 1.4
-benchmarker: "Izaskun Mallona, Daniel Incicau"
-storage: http://omnibenchmark.org:9000
-benchmark_yaml_spec: 0.04
-storage_api: S3
-storage_bucket_name: clusteringexampleapptainer
+version: 1.5
+benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
+benchmark_yaml_spec: 0.4
+
 software_backend: apptainer
+
 software_environments:
+
   clustbench:
     description: "clustbench on py3.12.6"
-    conda: envs/clustbench.yml
-    envmodule: clustbench
-    apptainer: envs/clustbench.sif
-  sklearn:
-    description: "Daniel's on py3.12.6"
-    conda: envs/sklearn.yml
-    apptainer: envs/sklearn.sif
-    envmodule: clustbench # not true, but
-  R:
-    description: "Daniel's R with readr, dplyr, mclust, caret"
-    conda: envs/r.yml
-    apptainer: envs/r.sif
-    envmodule: fcps # not true, but
+    conda: envs/clustbench.yml # not used
+    envmodule: na
+    apptainer: envs/clustbench-optimized.sif
+
   rmarkdown:
     description: "R with some plotting dependencies"
-    conda: envs/rmarkdown.yml
-    apptainer: envs/r.sif # not true, but
-    envmodule: fcps # not true, but
+    conda: envs/rmarkdown.yml # not used
+    envmodule: na
+    apptainer: envs/rmarkdown.sif
+
   fcps:
     description: "CRAN's FCPS"
     conda: envs/fcps.yml
+    envmodule: na
     apptainer: envs/fcps.sif
-    envmodule: fcps
+
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
@@ -45,10 +39,11 @@ metric_collectors:
     outputs:
       - id: plotting.html
         path: "{input}/{name}/plotting_report.html"
+
 stages:
-  ## clustbench data ##########################################################
 
   - id: data
+    ## clustbench data
     modules:
       - id: clustbench
         name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
@@ -229,89 +224,3 @@ stages:
     outputs:
       - id: metrics.scores
         path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
-
-  # ## daniel's data ###########################################################################
-  
-  # - id: danielsdata
-  #   modules:
-  #     - id: iris_manual
-  #       name: "Iris Dataset"
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/iris.git
-  #         commit: 47c63f0
-  #     - id: penguins
-  #       name: "Penguins Dataset"
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/penguins.git
-  #         commit: 9032478
-  #   outputs:
-  #     - id: data.features
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv"
-  #     - id: data.labels
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv"
-  
-  # ## daniel's distances ########################################################################
-  
-  # - id: distances
-  #   modules:
-  #     - id: D1
-  #       software_environment: "sklearn"
-  #       parameters:
-  #         - values: ["--measure", "cosine"]
-  #         - values: ["--measure", "euclidean"]
-  #         - values: ["--measure", "manhattan"]
-  #         - values: ["--measure", "chebyshev"]
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/distance.git
-  #         commit: dd99d4f
-  #   inputs:
-  #     - entries:
-  #         - data.features
-  #   outputs:
-  #     - id: distances
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv"
-        
-  # ## daniel's methods ###################################################################
-  
-  # - id: danielmethods
-  #   modules:
-  #     - id: kmeans
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/kmeans.git
-  #         commit: 049c8b1
-  #     - id: ward
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/ward.git
-  #         commit: 976e3f3
-  #   inputs:
-  #     - entries:
-  #         - distances
-  #   outputs:
-  #     - id: methods.clusters
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv"
-
-  # ## daniel's metrics ###################################################################
-
-  # - id: danielsmetrics
-  #   modules:
-  #     - id: ari
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/ari.git
-  #         commit: 72708f0
-  #     - id: accuracy
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/accuracy.git
-  #         commit: e26b32f
-  #   inputs:
-  #     - entries:
-  #         - methods.clusters
-  #         - data.labels
-  #   outputs:
-  #     - id: metrics.mapping
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt"
diff --git a/Clustering_conda.yml b/Clustering_conda.yml
index 7ac1629..61352e1 100644
--- a/Clustering_conda.yml
+++ b/Clustering_conda.yml
@@ -1,38 +1,32 @@
 id: clustering_example_conda
+
 description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
-version: 1.4
-benchmarker: "Izaskun Mallona, Daniel Incicau"
-storage: http://omnibenchmark.org:9000
-benchmark_yaml_spec: 0.04
-storage_api: S3
-storage_bucket_name: clusteringexampleconda
+version: 1.5
+benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
+benchmark_yaml_spec: 0.4
+
 software_backend: conda
+
 software_environments:
+
   clustbench:
     description: "clustbench on py3.12.6"
     conda: envs/clustbench.yml
     envmodule: clustbench
-    apptainer: envs/clustbench.sif
-  sklearn:
-    description: "Daniel's on py3.12.6"
-    conda: envs/sklearn.yml
-    apptainer: envs/sklearn.sif
-    envmodule: clustbench # not true, but
-  R:
-    description: "Daniel's R with readr, dplyr, mclust, caret"
-    conda: envs/r.yml
-    apptainer: envs/r.sif
-    envmodule: fcps # not true, but
+    apptainer: na
+
   rmarkdown:
     description: "R with some plotting dependencies"
     conda: envs/rmarkdown.yml
-    apptainer: envs/r.sif # not true, but
-    envmodule: fcps # not true, but
+    envmodule: fcps # not used
+    apptainer: na
+
   fcps:
     description: "CRAN's FCPS"
     conda: envs/fcps.yml
-    apptainer: envs/fcps.sif
     envmodule: fcps
+    apptainer: na
+
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
@@ -45,6 +39,7 @@ metric_collectors:
     outputs:
       - id: plotting.html
         path: "{input}/{name}/plotting_report.html"
+
 stages:
   ## clustbench data ##########################################################
 
@@ -52,7 +47,7 @@ stages:
     modules:
       - id: clustbench
         name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_data
           commit: 366c5a2
@@ -145,7 +140,7 @@ stages:
           - values: ["--linkage", "centroid"]
       - id: sklearn
         name: "sklearn"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_sklearn
           #url: /home/imallona/src/clustbench_sklearn
@@ -229,89 +224,3 @@ stages:
     outputs:
       - id: metrics.scores
         path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
-
-  # ## daniel's data ###########################################################################
-  
-  # - id: danielsdata
-  #   modules:
-  #     - id: iris_manual
-  #       name: "Iris Dataset"
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/iris.git
-  #         commit: 47c63f0
-  #     - id: penguins
-  #       name: "Penguins Dataset"
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/penguins.git
-  #         commit: 9032478
-  #   outputs:
-  #     - id: data.features
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv"
-  #     - id: data.labels
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv"
-  
-  # ## daniel's distances ########################################################################
-  
-  # - id: distances
-  #   modules:
-  #     - id: D1
-  #       software_environment: "sklearn"
-  #       parameters:
-  #         - values: ["--measure", "cosine"]
-  #         - values: ["--measure", "euclidean"]
-  #         - values: ["--measure", "manhattan"]
-  #         - values: ["--measure", "chebyshev"]
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/distance.git
-  #         commit: dd99d4f
-  #   inputs:
-  #     - entries:
-  #         - data.features
-  #   outputs:
-  #     - id: distances
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv"
-        
-  # ## daniel's methods ###################################################################
-  
-  # - id: danielmethods
-  #   modules:
-  #     - id: kmeans
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/kmeans.git
-  #         commit: 049c8b1
-  #     - id: ward
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/ward.git
-  #         commit: 976e3f3
-  #   inputs:
-  #     - entries:
-  #         - distances
-  #   outputs:
-  #     - id: methods.clusters
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv"
-
-  # ## daniel's metrics ###################################################################
-
-  # - id: danielsmetrics
-  #   modules:
-  #     - id: ari
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/ari.git
-  #         commit: 72708f0
-  #     - id: accuracy
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/accuracy.git
-  #         commit: e26b32f
-  #   inputs:
-  #     - entries:
-  #         - methods.clusters
-  #         - data.labels
-  #   outputs:
-  #     - id: metrics.mapping
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt"
diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml
index f37fd6c..52fb13e 100644
--- a/Clustering_envmodules.yml
+++ b/Clustering_envmodules.yml
@@ -1,28 +1,33 @@
 id: clustering_example_envmodules
+
 description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
-version: 1.4
-benchmarker: "Izaskun Mallona, Daniel Incicau"
-benchmark_yaml_spec: 0.5
+version: 1.5
+benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
+benchmark_yaml_spec: 0.4
 
 software_backend: envmodules
 
 software_environments:
+
   clustbench:
     description: "clustbench on py3.12.6"
+    conda: envs/clustbench.yml # not used
     envmodule: clustbench/0.1.0-foss-2023b
-    conda: envs/clustbench.yml
     apptainer: na
+
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    conda: envs/rmarkdown.yml # not used
+    envmodule: rmarkdown
+    apptainer: na
+
   fcps:
     description: "CRAN's FCPS"
+    conda: envs/fcps.yml # not used
     envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
-    conda: envs/fcps.yml
-    apptainer: na
-  rmarkdown:
-    description: "R with some plotting dependencies"
-    envmodule: rmarkdown # TODO
-    conda: envs/clustbench.yml
     apptainer: na
 
+
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
index 61fbd13..784e443 100755
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -1,7 +1,6 @@
 #!/bin/sh
 CMD=singularity
 BUILD=build --fakeroot
-$CMD $BUILD sklearn_optimized.sif sklearn_singularity_optimized.def
-$CMD $BUILD clustbench-optimized.sif clustbench_singularity.def
-$CMD $BUILD r.sif r_singularity.def
-$CMD $BUILD fcps.sif fcps_singularity.def
+$CMD $BUILD clustbench-vanilla.sif clustbench_apptainer_vanillapy.def
+$CMD $BUILD clustbench-optimized.sif clustbench_apptainer_optimized.def
+$CMD $BUILD fcps.sif fcps_singularity_optimized.def
diff --git a/envs/sklearn_singularity_optimized.def b/envs/clustbench_apptainer_optimized.def
similarity index 71%
rename from envs/sklearn_singularity_optimized.def
rename to envs/clustbench_apptainer_optimized.def
index 17a131d..d4a316d 100644
--- a/envs/sklearn_singularity_optimized.def
+++ b/envs/clustbench_apptainer_optimized.def
@@ -15,7 +15,6 @@ From: ubuntu:noble-20250404
     echo "deb-src http://archive.ubuntu.com/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list
     apt-get update
 
-
     # Get build dependencies for Python
     apt-get build-dep -y python3
 
@@ -43,17 +42,39 @@ From: ubuntu:noble-20250404
 
     # Install required packages with pip
 
-    pip3 install \
+    pip install -U pip
+
+    pip install \
       "clustering-benchmarks==1.1.6" \
-      "fastcluster==1.2.6" \
-      "numpy==1.26.4" \
-      "scipy==1.14.1" \
-      "isodate" \
-      "pydantic-core"  \
+      "contourpy==1.3.2" \
+      "cycler==0.12.1" \
+      "cython==3.1.0" \
+      "fonttools==4.58.0" \
       "genieclust==1.1.6" \
+      "joblib==1.5.0" \
+      "kiwisolver==1.4.8" \
+      "matplotlib==3.10.3" \
+      "natsort==8.4.0" \
+      "numpy==2.2.5" \
+      "packaging==25.0" \
       "pandas==2.2.3" \
+      "pillow==11.2.1" \
+      "pyparsing==3.2.3" \
+      "python-dateutil==2.9.0.post0" \
+      "pytz==2025.2" \
+      "scikit-learn==1.6.1" \
+      "scipy==1.15.3" \
+      "six==1.17.0" \
+      "threadpoolctl==3.6.0" \
+      "tzdata==2025.2" \
+      "fastcluster==1.2.6" \
       "gitpython==3.1.43" \
-      wget"
+      "isodate==0.7.2" \
+      "pydantic-core==2.34.1"
+
+    # TODO: can we use something more maintained?
+    pip install --pre "python3-wget==0.0.2-beta1"
+
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 
diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def
new file mode 100644
index 0000000..1f2b4e3
--- /dev/null
+++ b/envs/clustbench_apptainer_vanillapy.def
@@ -0,0 +1,55 @@
+Bootstrap: docker
+From: ubuntu:noble-20250404
+
+%labels
+    Author izaskun.mallona@gmail.com
+    Author ben.uzh@proton.me
+
+%post
+    # Create virtualenv using the default Python
+    mkdir -p /opt && cd /opt
+    python3.12 -m venv "default"
+    . default/bin/activate
+
+    # Install required packages with pip
+
+    pip install -U pip
+
+    pip install \
+      "clustering-benchmarks==1.1.6" \
+      "contourpy==1.3.2" \
+      "cycler==0.12.1" \
+      "cython==3.1.0" \
+      "fonttools==4.58.0" \
+      "genieclust==1.1.6" \
+      "joblib==1.5.0" \
+      "kiwisolver==1.4.8" \
+      "matplotlib==3.10.3" \
+      "natsort==8.4.0" \
+      "numpy==2.2.5" \
+      "packaging==25.0" \
+      "pandas==2.2.3" \
+      "pillow==11.2.1" \
+      "pyparsing==3.2.3" \
+      "python-dateutil==2.9.0.post0" \
+      "pytz==2025.2" \
+      "scikit-learn==1.6.1" \
+      "scipy==1.15.3" \
+      "six==1.17.0" \
+      "threadpoolctl==3.6.0" \
+      "tzdata==2025.2" \
+      "fastcluster==1.2.6" \
+      "gitpython==3.1.43" \
+      "isodate==0.7.2" \
+      "pydantic-core==2.34.1"
+
+    # TODO: can we use something more maintained?
+    pip install --pre "python3-wget==0.0.2-beta1"
+
+
+    echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
+
+%environment
+
+    . /opt/default/bin/activate
+
diff --git a/envs/clustbench_singularity.def b/envs/clustbench_singularity.def
deleted file mode 100644
index 8c2ae85..0000000
--- a/envs/clustbench_singularity.def
+++ /dev/null
@@ -1,35 +0,0 @@
-Bootstrap: docker 
-From: ubuntu:jammy-20240911.1
-
-%labels
-
-    AUTHOR izaskun.mallona@gmail.com
-
-%post
-    
-    # Install python3.12
-    apt-get update
-    apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \
-        libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git
-
-    wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz
-    tar -xf Python-3.12.6.tgz
-    cd Python-3.12.*/
-    ./configure --enable-optimizations
-    make -j 4
-    make altinstall
-
-    # virtualenv
-    cd /opt
-    python3.12 -m venv "default"
-    . default/bin/activate
-    
-    pip3 install "clustering-benchmarks==1.1.5" "wget" "fastcluster==1.2.6" "numpy==1.26.4" "scipy==1.14.1" \
-      "isodate" "pydantic-core"  \
-      "genieclust==1.1.6" "pandas==2.2.3" "gitpython==3.1.43"
-
-    echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
-
-%environment
-
-    . /opt/default/bin/activate
diff --git a/envs/fcps_singularity.def b/envs/fcps_singularity_optimized.def
similarity index 79%
rename from envs/fcps_singularity.def
rename to envs/fcps_singularity_optimized.def
index a4a615e..6362b9e 100644
--- a/envs/fcps_singularity.def
+++ b/envs/fcps_singularity_optimized.def
@@ -4,6 +4,7 @@ From: rocker/tidyverse:4.3.3
 %labels
 
     AUTHOR izaskun.mallona@gmail.com
+    AUTHOR ben.uzh@proton.me
 
 %post
 
@@ -13,11 +14,11 @@ From: rocker/tidyverse:4.3.3
         libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git \
         libgsl-dev
 
-    wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz
-    tar -xf Python-3.12.6.tgz
+    wget https://www.python.org/ftp/python/3.12.9/Python-3.12.9.tgz
+    tar -xf Python-3.12.9.tgz
     cd Python-3.12.*/
     ./configure --enable-optimizations
-    make -j 4
+    make -j 8
     make altinstall
 
     # virtualenv
@@ -25,13 +26,15 @@ From: rocker/tidyverse:4.3.3
     python3.12 -m venv "default"
     . default/bin/activate
 
+    # TODO: pin dependencies
     pip install gitpython==3.1.43 isodate pydantic-core
 
     ## no versioning here
+    ## TODO(ben): get same versions as in easyconfig
     Rscript -e 'BiocManager::install(c( "dbscan", "cluster", "protoclust", "energy", "argparse", "mclust", "DataVisualizations", "FCPS", "cclust"))'
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 
 %environment
-    
+
     . /opt/default/bin/activate
diff --git a/envs/sklearn.yml b/envs/sklearn.yml
deleted file mode 100644
index 258b7ea..0000000
--- a/envs/sklearn.yml
+++ /dev/null
@@ -1,11 +0,0 @@
-name: sklearn
-channels:
-  - conda-forge
-  - nodefaults
-dependencies:
-  - conda-forge::python=3.12.6
-  - conda-forge::scikit-learn
-  - conda-forge::pip
-  - pip:
-    - "pandas"
-    - "argparse"

From e8e0f7eb2313696f7494e65c20d44def8301de63 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 13:29:28 +0200
Subject: [PATCH 22/60] escape

---
 envs/build_singularity.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
index 784e443..2dae40a 100755
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 CMD=singularity
-BUILD=build --fakeroot
-$CMD $BUILD clustbench-vanilla.sif clustbench_apptainer_vanillapy.def
-$CMD $BUILD clustbench-optimized.sif clustbench_apptainer_optimized.def
-$CMD $BUILD fcps.sif fcps_singularity_optimized.def
+BUILD='build --fakeroot'
+$CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def
+$CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def
+$CMD ${BUILD} fcps.sif fcps_singularity_optimized.def

From a8336fba907ae43ce16678de97a22538addb06e6 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 13:42:08 +0200
Subject: [PATCH 23/60] install updated python

---
 envs/clustbench_apptainer_vanillapy.def | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def
index 1f2b4e3..5d388bf 100644
--- a/envs/clustbench_apptainer_vanillapy.def
+++ b/envs/clustbench_apptainer_vanillapy.def
@@ -6,9 +6,19 @@ From: ubuntu:noble-20250404
     Author ben.uzh@proton.me
 
 %post
+    export DEBIAN_FRONTEND=noninteractive
+    apt-get update && \
+        apt-get install -y \
+        python3 \
+        python3-venv \
+        python3-pip \
+        ca-certificates \
+        && apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+
     # Create virtualenv using the default Python
     mkdir -p /opt && cd /opt
-    python3.12 -m venv "default"
+    /usr/bin/python3 -m venv "default"
     . default/bin/activate
 
     # Install required packages with pip
@@ -46,6 +56,8 @@ From: ubuntu:noble-20250404
     # TODO: can we use something more maintained?
     pip install --pre "python3-wget==0.0.2-beta1"
 
+    # Do some cleanup to keep the image slim
+    rm -rf ~/.cache
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 

From 518c2f6c894b097855a32f7e810783b13e9ec386 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 13:46:47 +0200
Subject: [PATCH 24/60] sync the two build recipes

---
 envs/clustbench_apptainer_optimized.def | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def
index d4a316d..1e934a8 100644
--- a/envs/clustbench_apptainer_optimized.def
+++ b/envs/clustbench_apptainer_optimized.def
@@ -10,6 +10,7 @@ From: ubuntu:noble-20250404
     PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. -f1,2)
 
     # Update and enable deb-src
+    export DEBIAN_FRONTEND=noninteractive
     apt-get update
     echo "deb-src http://archive.ubuntu.com/ubuntu/ noble main restricted universe multiverse" >> /etc/apt/sources.list
     echo "deb-src http://archive.ubuntu.com/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list
@@ -19,19 +20,33 @@ From: ubuntu:noble-20250404
     apt-get build-dep -y python3
 
     # Extra dependencies
-    apt-get install -y git python-is-python3 wget zlib1g-dev libbz2-dev libssl-dev libffi-dev
+    apt-get install -y git \
+        python-is-python3 \
+        wget \
+        zlib1g-dev \
+        libbz2-dev \
+        libssl-dev \
+        libffi-dev \
+        && apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
 
     # Calculate half the number of available cores
     HALF_NPROC=$(( $(nproc) / 2 ))
     # Ensure at least one core is used
     CORES_TO_USE=$(( HALF_NPROC > 0 ? HALF_NPROC : 1 ))
 
-    # Download and build Python with optimizations
+    # Download and build Python from source, with optimizations
+
     wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz
     tar -xf Python-${PYTHON_VERSION}.tgz
     cd Python-${PYTHON_VERSION}*/
+
     # Enable all possible optimizations
-    ./configure --enable-optimizations --with-lto --enable-shared LDFLAGS="-Wl,-rpath /usr/local/lib"
+    ./configure \
+        --enable-optimizations \
+        --with-lto \
+        --enable-shared \
+        LDFLAGS="-Wl,-rpath /usr/local/lib"
     make -j ${CORES_TO_USE}
     make altinstall
 
@@ -75,6 +90,8 @@ From: ubuntu:noble-20250404
     # TODO: can we use something more maintained?
     pip install --pre "python3-wget==0.0.2-beta1"
 
+    # Do some cleanup to keep the image slim
+    rm -rf ~/.cache
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 

From 2f4131f08f967bf0b934a25907e21bb9d54c001c Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 14:09:01 +0200
Subject: [PATCH 25/60] delete source folder

---
 envs/clustbench_apptainer_optimized.def | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def
index 1e934a8..eda9ea6 100644
--- a/envs/clustbench_apptainer_optimized.def
+++ b/envs/clustbench_apptainer_optimized.def
@@ -37,6 +37,7 @@ From: ubuntu:noble-20250404
 
     # Download and build Python from source, with optimizations
 
+    mkdir ~/src && cd src
     wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz
     tar -xf Python-${PYTHON_VERSION}.tgz
     cd Python-${PYTHON_VERSION}*/
@@ -92,6 +93,7 @@ From: ubuntu:noble-20250404
 
     # Do some cleanup to keep the image slim
     rm -rf ~/.cache
+    rm -rf ~/src
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 

From c72eb273f395e9c5805ee0806cc247a77b783443 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 14:15:59 +0200
Subject: [PATCH 26/60] add microbenchmark for numpy operations

---
 microbenchmark/microbench.py | 67 ++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 microbenchmark/microbench.py

diff --git a/microbenchmark/microbench.py b/microbenchmark/microbench.py
new file mode 100644
index 0000000..3730f9e
--- /dev/null
+++ b/microbenchmark/microbench.py
@@ -0,0 +1,67 @@
+"""
+This script exercises a few common linear algebra operations in numpy.
+It's intended mostly to gauge whether it makes sense to descend into
+compiler optimizations for the Python binary that we ship within the SIF images,
+but it can be easily repurposed for other specific microbenchmarks (e.g., numba or GPU perf gains).
+
+Be aware that here we're profiling simple operations; it would make sense to carefully
+profile the libraries of interest to see where the computational bottlenecks really are.
+
+Usage:
+
+singularity exec clustbench-vanilla.sif python3 microbench.py
+singularity exec clustbench-optimized.sif python3 microbench.py
+"""
+import numpy as np
+import time
+import json
+from statistics import mean, stdev
+
+def run_operation(operation, func, repetitions):
+    timings = []
+    for _ in range(repetitions):
+        start = time.perf_counter()
+        func()
+        timings.append(time.perf_counter() - start)
+    return {
+        'operation': operation,
+        'mean': mean(timings),
+        # stdev() raises StatisticsError below two samples; report null for a single run
+        'stdev': stdev(timings) if len(timings) > 1 else None,
+        'runs': repetitions
+    }
+
+def benchmark(repetitions=50):
+    np.random.seed(42)
+    size = 1000
+
+    # Create random matrices
+    A = np.random.rand(size, size)
+    B = np.random.rand(size, size)
+    C = A @ A.T  # Ensure positive definite for Cholesky
+
+    # Define operations
+    operations = [
+        ('mat_mul', lambda: np.dot(A, B)),
+        ('svd', lambda: np.linalg.svd(A)),
+        ('chol_decomp', lambda: np.linalg.cholesky(C))
+    ]
+
+    results = []
+    for operation, func in operations:
+        try:
+            result = run_operation(operation, func, repetitions)
+        except np.linalg.LinAlgError:
+            result = {
+                'operation': operation,
+                'error': 'Operation failed due to numerical instability'
+            }
+        results.append(result)
+
+    # Output results as JSON
+    print(json.dumps(results, indent=2))
+
+if __name__ == "__main__":
+    import sys
+    repetitions = int(sys.argv[1]) if len(sys.argv) > 1 else 10
+    benchmark(repetitions)

From 937e45599633e3a58af28beece19f038a4fd9513 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 14:16:41 +0200
Subject: [PATCH 27/60] fix path

---
 envs/clustbench_apptainer_optimized.def | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def
index eda9ea6..19726c2 100644
--- a/envs/clustbench_apptainer_optimized.def
+++ b/envs/clustbench_apptainer_optimized.def
@@ -37,7 +37,7 @@ From: ubuntu:noble-20250404
 
     # Download and build Python from source, with optimizations
 
-    mkdir ~/src && cd src
+    mkdir ~/src && cd ~/src
     wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz
     tar -xf Python-${PYTHON_VERSION}.tgz
     cd Python-${PYTHON_VERSION}*/

From b0bd85adfed66583b676a3f378f0577701efa5a5 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 14:24:30 +0200
Subject: [PATCH 28/60] default reps

---
 microbenchmark/microbench.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/microbenchmark/microbench.py b/microbenchmark/microbench.py
index 3730f9e..314e66b 100644
--- a/microbenchmark/microbench.py
+++ b/microbenchmark/microbench.py
@@ -17,6 +17,8 @@
 import json
 from statistics import mean, stdev
 
+DEFAULT_REPETITIONS = 10
+
 def run_operation(operation, func, repetitions):
     timings = []
     for _ in range(repetitions):
@@ -31,7 +33,7 @@ def run_operation(operation, func, repetitions):
         'runs': repetitions
     }
 
-def benchmark(repetitions=50):
+def benchmark(repetitions=DEFAULT_REPETITIONS):
     np.random.seed(42)
     size = 1000
 
@@ -63,5 +65,5 @@ def benchmark(repetitions=50):
 
 if __name__ == "__main__":
     import sys
-    repetitions = int(sys.argv[1]) if len(sys.argv) > 1 else 10
+    repetitions = int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_REPETITIONS
     benchmark(repetitions)

From 83f9b07dfcbf25dbfce67d17d92eedbe469e2bd9 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 14:40:30 +0200
Subject: [PATCH 29/60] refs

---
 microbenchmark/microbench.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/microbenchmark/microbench.py b/microbenchmark/microbench.py
index 314e66b..6abc6ee 100644
--- a/microbenchmark/microbench.py
+++ b/microbenchmark/microbench.py
@@ -11,6 +11,8 @@
 
 singularity exec clustbench-vanilla.sif python3 microbench.py
 singularity exec clustbench-optimized.sif python3 microbench.py
+
+References: https://pythonspeed.com/articles/faster-python/
 """
 import numpy as np
 import time

From 744c978643ad5623de1c4b176ab887e7a6127739 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 15:11:30 +0200
Subject: [PATCH 30/60] duplicate the apptainer clustering yaml

---
 Clustering_apptainer_optimized.yml |  39 +++--
 Clustering_apptainer_vanilla.yml   | 223 +++++++++++++++++++++++++++++
 2 files changed, 241 insertions(+), 21 deletions(-)
 create mode 100644 Clustering_apptainer_vanilla.yml

diff --git a/Clustering_apptainer_optimized.yml b/Clustering_apptainer_optimized.yml
index 96e357e..a073683 100644
--- a/Clustering_apptainer_optimized.yml
+++ b/Clustering_apptainer_optimized.yml
@@ -1,6 +1,6 @@
-id: clustering_example_apptainer
-
+id: clustering_example_apptainer_optimized
 description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
+
 version: 1.5
 benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
 benchmark_yaml_spec: 0.4
@@ -10,27 +10,28 @@ software_backend: apptainer
 software_environments:
 
   clustbench:
-    description: "clustbench on py3.12.6"
+    description: "clustbench on py3.12.9, optimized python build"
     conda: envs/clustbench.yml # not used
     envmodule: na
     apptainer: envs/clustbench-optimized.sif
 
+  fcps:
+    description: "CRAN's FCPS"
+    conda: envs/fcps.yml # not used
+    envmodule: na
+    apptainer: envs/fcps.sif
+
   rmarkdown:
     description: "R with some plotting dependencies"
     conda: envs/rmarkdown.yml # not used
     envmodule: na
     apptainer: envs/rmarkdown.sif
 
-  fcps:
-    description: "CRAN's FCPS"
-    conda: envs/fcps.yml
-    envmodule: na
-    apptainer: envs/fcps.sif
 
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
-    software_environment: "rmarkdown"
+    software_environment: rmarkdown
     repository:
       url: https://github.com/imallona/clustering_report
       commit: 1d6bdf5
@@ -43,11 +44,10 @@ metric_collectors:
 stages:
 
   - id: data
-    ## clustbench data
     modules:
       - id: clustbench
         name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_data
           commit: 366c5a2
@@ -120,16 +120,13 @@ stages:
       - id: data.true_labels
         path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
 
-  ## clustbench methods (fastcluster) ###################################################################
-  
   - id: clustering
     modules:
       - id: fastcluster
         name: "fastcluster algorithm"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_fastcluster
-          # url: /home/imallona/src/clustbench_fastcluster/
           commit: "45e43d3"
         parameters:
           - values: ["--linkage", "complete"]
@@ -138,12 +135,12 @@ stages:
           - values: ["--linkage", "weighted"]
           - values: ["--linkage", "median"]
           - values: ["--linkage", "centroid"]
+
       - id: sklearn
-        name: "sklearn"
-        software_environment: "clustbench"
+        name: sklearn
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_sklearn
-          #url: /home/imallona/src/clustbench_sklearn
           commit: 5877378
         parameters:
           - values: ["--method", "birch"]
@@ -161,8 +158,8 @@ stages:
           - values: ["--linkage", "complete"]
           - values: ["--linkage", "ward"]
       - id: genieclust
-        name: "genieclust"
-        software_environment: "clustbench"
+        name: genieclust
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_genieclust
           commit: 6090043
@@ -201,7 +198,7 @@ stages:
     modules:
       - id: partition_metrics
         name: "clustbench partition metrics"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_metrics
           commit: 9132d45
diff --git a/Clustering_apptainer_vanilla.yml b/Clustering_apptainer_vanilla.yml
new file mode 100644
index 0000000..46b8ea4
--- /dev/null
+++ b/Clustering_apptainer_vanilla.yml
@@ -0,0 +1,223 @@
+id: clustering_example_apptainer_vanilla
+
+description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
+version: 1.5
+benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
+benchmark_yaml_spec: 0.4
+
+software_backend: apptainer
+
+software_environments:
+
+  clustbench:
+    description: "clustbench on py3.12.6"
+    conda: envs/clustbench.yml # not used
+    envmodule: na
+    apptainer: envs/clustbench-vanilla.sif
+
+  fcps:
+    description: "CRAN's FCPS"
+    conda: envs/fcps.yml # not used
+    envmodule: na
+    apptainer: envs/fcps.sif
+
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    conda: envs/rmarkdown.yml # not used
+    envmodule: na
+    apptainer: envs/rmarkdown.sif
+
+
+metric_collectors:
+  - id: plotting
+    name: "Single-backend metric collector."
+    software_environment: rmarkdown
+    repository:
+      url: https://github.com/imallona/clustering_report
+      commit: 1d6bdf5
+    inputs:
+      - metrics.scores
+    outputs:
+      - id: plotting.html
+        path: "{input}/{name}/plotting_report.html"
+
+stages:
+
+  - id: data
+    modules:
+      - id: clustbench
+        name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_data
+          commit: 366c5a2
+        parameters:  # comments depict the possible cardinalities and the number of curated labelsets
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] #	2	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] #	2	2
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] #	7	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] #	3	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] #	2, 6	2
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] #	4	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] #	2	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] #	2	1
+          - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] #	2	1
+          - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] #	2, 4, 5	6
+          - values: ["--dataset_generator", "graves", "--dataset_name", "line"] #	2	1
+          - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] #	2, 4	2
+          - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] #	2	1
+          - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] #	2	1
+          - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] #	2, 5	2
+          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] #	3, 5	2
+          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] #	3, 5	2
+          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] #	3, 5	2
+          - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] #	6	1
+          - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] #	6	1
+          - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] #	6	1
+          - values: ["--dataset_generator", "other", "--dataset_name", "iris"] #	3	1
+          - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] #	3	1
+          - values: ["--dataset_generator", "other", "--dataset_name", "square"] #	2	1
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] #	7	1
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] #	4, 5, 6	5
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] #	2	2
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] #	2	1
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] #	3, 4	2
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] #	8, 9, 15	3
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] #	3	1
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] #	8	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] #	8	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] #	2	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] #	2	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] #	7	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] #	2	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] #	3	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] #	10	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] #	4	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] #	4	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] #	10	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] #	6	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] #	2	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] #	5	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] #	4, 6	2
+          - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] #	2	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] #	4	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] #	2	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] #	5	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] #	4	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] #	5	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] #	4	1
+    outputs:
+      - id: data.matrix
+        path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
+      - id: data.true_labels
+        path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
+
+  - id: clustering
+    modules:
+      - id: fastcluster
+        name: "fastcluster algorithm"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_fastcluster
+          commit: "45e43d3"
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+          - values: ["--linkage", "average"]
+          - values: ["--linkage", "weighted"]
+          - values: ["--linkage", "median"]
+          - values: ["--linkage", "centroid"]
+
+      - id: sklearn
+        name: sklearn
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_sklearn
+          commit: 5877378
+        parameters:
+          - values: ["--method", "birch"]
+          - values: ["--method", "kmeans"]
+          # - values: ["--method", "spectral"] ## too slow
+          - values: ["--method", "gm"]
+      - id: agglomerative
+        name: "agglomerative"
+        software_environment: "clustbench"
+        repository:
+          url: https://github.com/imallona/clustbench_agglomerative
+          commit: 5454368
+        parameters:
+          - values: ["--linkage", "average"]
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: genieclust
+        name: genieclust
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_genieclust
+          commit: 6090043
+        parameters:
+          - values: ["--method", "genie", "--gini_threshold", 0.5]
+          - values: ["--method", "gic"]
+          - values: ["--method", "ica"]
+      - id: fcps
+        name: "fcps"
+        software_environment: "fcps"
+        repository:
+          url: https://github.com/imallona/clustbench_fcps
+          commit: 272fa5f
+        parameters:
+          # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in conda
+          - values: ["--method", "FCPS_Minimax"]
+          - values: ["--method", "FCPS_MinEnergy"]
+          - values: ["--method", "FCPS_HDBSCAN_2"]
+          - values: ["--method", "FCPS_HDBSCAN_4"]
+          - values: ["--method", "FCPS_HDBSCAN_8"]
+          - values: ["--method", "FCPS_Diana"]
+          - values: ["--method", "FCPS_Fanny"]
+          - values: ["--method", "FCPS_Hardcl"]
+          - values: ["--method", "FCPS_Softcl"]
+          - values: ["--method", "FCPS_Clara"]
+          - values: ["--method", "FCPS_PAM"]
+    inputs:
+      - entries:
+          - data.matrix
+          - data.true_labels
+    outputs:
+      - id: clustering.predicted_ks_range
+        path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
+
+  - id: metrics
+    modules:
+      - id: partition_metrics
+        name: "clustbench partition metrics"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_metrics
+          commit: 9132d45
+        parameters:
+          - values: ["--metric", "normalized_clustering_accuracy"]
+          - values: ["--metric", "adjusted_fm_score"]
+          - values: ["--metric", "adjusted_mi_score"]
+          - values: ["--metric", "adjusted_rand_score"]
+          - values: ["--metric", "fm_score"]
+          - values: ["--metric", "mi_score"]
+          - values: ["--metric", "normalized_clustering_accuracy"] # NOTE(review): same metric as the first entry of this list; confirm the duplicate is intended
+          - values: ["--metric", "normalized_mi_score"]
+          - values: ["--metric", "normalized_pivoted_accuracy"]
+          - values: ["--metric", "pair_sets_index"]
+          - values: ["--metric", "rand_score"]
+    inputs:
+      - entries:
+          - clustering.predicted_ks_range
+          - data.true_labels
+    outputs:
+      - id: metrics.scores
+        path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"

From ec18dcf21ce6d23a1c20f88a5431d9a2c040abae Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 15:15:36 +0200
Subject: [PATCH 31/60] update the oras yaml. not working, just to keep in sync

---
 Clustering_oras.yml | 128 ++++++++------------------------------------
 1 file changed, 22 insertions(+), 106 deletions(-)

diff --git a/Clustering_oras.yml b/Clustering_oras.yml
index 6640461..c6f0d7e 100644
--- a/Clustering_oras.yml
+++ b/Clustering_oras.yml
@@ -1,36 +1,37 @@
-id: clustering_example
+id: clustering_example_oras
 description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. Caution dirty apptainer sifs.
-version: 1.2
+version: 1.5
+
 benchmarker: "Izaskun Mallona, Daniel Incicau"
-storage: https://play.min.io
-benchmark_yaml_spec: 0.04
-storage_api: S3
-storage_bucket_name: clustering_example
+benchmark_yaml_spec: 0.4
+
+#storage: https://play.min.io
+#storage_api: S3
+#storage_bucket_name: clustering_example
+
 software_backend: apptainer
+
 software_environments:
+
   clustbench:
     description: "clustbench on py3.12.6"
-    conda: envs/clustbench.yml
+    conda: envs/clustbench.yml # not used
     envmodule: clustbench
     apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/clustbench:latest
-  sklearn:
-    description: "Daniel's on py3.12.6"
-    conda: envs/sklearn.yml
-    apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/r:latest
-    envmodule: clustbench # not true, but
-  R:
-    description: "Daniel's R with readr, dplyr, mclust, caret"
-    conda: envs/r.yml
-    apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/sklearn:latest
-    envmodule: fcps # not true, but
+
   fcps:
     description: "CRAN's FCPS"
-    conda: envs/fcps.yml
+    conda: envs/fcps.yml # not used
+    envmodule: na
     apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/fcps:latest
-    envmodule: fcps
-stages:
 
-  ## clustbench data ##########################################################
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    conda: envs/rmarkdown.yml # not used
+    envmodule: na
+    apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/rmarkdown:latest
+
+stages:
 
   - id: data
     modules:
@@ -214,88 +215,3 @@ stages:
       - id: metrics.scores
         path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
 
-  # ## daniel's data ###########################################################################
-  
-  # - id: danielsdata
-  #   modules:
-  #     - id: iris_manual
-  #       name: "Iris Dataset"
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/iris.git
-  #         commit: 47c63f0
-  #     - id: penguins
-  #       name: "Penguins Dataset"
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/penguins.git
-  #         commit: 9032478
-  #   outputs:
-  #     - id: data.features
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv"
-  #     - id: data.labels
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv"
-  
-  # ## daniel's distances ########################################################################
-  
-  # - id: distances
-  #   modules:
-  #     - id: D1
-  #       software_environment: "sklearn"
-  #       parameters:
-  #         - values: ["--measure", "cosine"]
-  #         - values: ["--measure", "euclidean"]
-  #         - values: ["--measure", "manhattan"]
-  #         - values: ["--measure", "chebyshev"]
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/distance.git
-  #         commit: dd99d4f
-  #   inputs:
-  #     - entries:
-  #         - data.features
-  #   outputs:
-  #     - id: distances
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv"
-        
-  # ## daniel's methods ###################################################################
-  
-  # - id: danielmethods
-  #   modules:
-  #     - id: kmeans
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/kmeans.git
-  #         commit: 049c8b1
-  #     - id: ward
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/ward.git
-  #         commit: 976e3f3
-  #   inputs:
-  #     - entries:
-  #         - distances
-  #   outputs:
-  #     - id: methods.clusters
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv"
-
-  # ## daniel's metrics ###################################################################
-
-  # - id: danielsmetrics
-  #   modules:
-  #     - id: ari
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/ari.git
-  #         commit: 72708f0
-  #     - id: accuracy
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/accuracy.git
-  #         commit: e26b32f
-  #   inputs:
-  #     - entries:
-  #         - methods.clusters
-  #         - data.labels
-  #   outputs:
-  #     - id: metrics.mapping
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt"

From cf52a2c7b3595e25488b1f0a007e3d30045fb74b Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 15:21:02 +0200
Subject: [PATCH 32/60] update the rmarkdown environment

---
 Clustering_envmodules.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml
index 52fb13e..a2112d4 100644
--- a/Clustering_envmodules.yml
+++ b/Clustering_envmodules.yml
@@ -18,7 +18,7 @@ software_environments:
   rmarkdown:
     description: "R with some plotting dependencies"
     conda: envs/rmakrkdown.yml # not used
-    envmodule: rmarkdown
+    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
     apptainer: na
 
   fcps:
@@ -44,7 +44,6 @@ metric_collectors:
 stages:
 
   - id: data
-    ## clustbench data
     modules:
       - id: clustbench
         name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"

From 934ce8baa625f2877a79958f6091fbd4eae4b96f Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 15:34:49 +0200
Subject: [PATCH 33/60] update makefile

---
 Clustering_conda.yml                |  12 +--
 Clustering_conda_smoketest.yml      | 129 +++++++++++++++++++++++++++
 Clustering_envmodules_smoketest.yml | 131 ++++++++++++++++++++++++++++
 Makefile                            |  23 ++++-
 envs/rmarkdown.yml                  |   4 +-
 5 files changed, 289 insertions(+), 10 deletions(-)
 create mode 100644 Clustering_conda_smoketest.yml
 create mode 100644 Clustering_envmodules_smoketest.yml

diff --git a/Clustering_conda.yml b/Clustering_conda.yml
index 61352e1..7822761 100644
--- a/Clustering_conda.yml
+++ b/Clustering_conda.yml
@@ -15,18 +15,18 @@ software_environments:
     envmodule: clustbench
     apptainer: na
 
-  rmarkdown:
-    description: "R with some plotting dependencies"
-    conda: envs/rmarkdown.yml
-    envmodule: fcps # not used
-    apptainer: na
-
   fcps:
     description: "CRAN's FCPS"
     conda: envs/fcps.yml
     envmodule: fcps
     apptainer: na
 
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    conda: envs/rmarkdown.yml
+    envmodule: fcps # not used
+    apptainer: na
+
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
diff --git a/Clustering_conda_smoketest.yml b/Clustering_conda_smoketest.yml
new file mode 100644
index 0000000..15215d7
--- /dev/null
+++ b/Clustering_conda_smoketest.yml
@@ -0,0 +1,129 @@
+id: clustering_example_conda # conda-backend smoketest; id must not reuse the envmodules benchmark id
+description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
+version: 1.4
+benchmarker: "Izaskun Mallona, Daniel Incicau"
+benchmark_yaml_spec: 0.5
+
+software_backend: conda
+
+software_environments:
+
+  clustbench:
+    description: "clustbench on py3.12.6"
+    envmodule: clustbench/0.1.0-foss-2023b
+    conda: envs/clustbench.yml
+    apptainer: na
+
+  fcps:
+    description: "CRAN's FCPS"
+    envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
+    conda: envs/fcps.yml
+    apptainer: na
+
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
+    conda: envs/rmarkdown.yml
+    apptainer: na
+
+metric_collectors:
+  - id: plotting
+    name: "Single-backend metric collector."
+    software_environment: rmarkdown
+    repository:
+      url: https://github.com/imallona/clustering_report
+      commit: 1d6bdf5
+    inputs:
+      - metrics.scores
+    outputs:
+      - id: plotting.html
+        path: "{input}/{name}/plotting_report.html"
+
+stages:
+  - id: data
+    modules:
+      - id: clustbench
+        name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_data
+          commit: 366c5a2
+        parameters: # comments depict the possible cardinalities and the number of curated labelsets
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
+    outputs:
+      - id: data.matrix
+        path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
+      - id: data.true_labels
+        path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
+
+  - id: clustering
+    modules:
+      - id: fastcluster
+        name: "fastcluster algorithm"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_fastcluster
+          commit: "45e43d3"
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: sklearn
+        name: "sklearn"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_sklearn
+          commit: 5877378
+        parameters:
+          - values: ["--method", "birch"]
+          - values: ["--method", "kmeans"]
+      - id: agglomerative
+        name: "agglomerative"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_agglomerative
+          commit: 5454368
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: genieclust
+        name: "genieclust"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_genieclust
+          commit: 6090043
+        parameters:
+          - values: ["--method", "genie", "--gini_threshold", 0.5]
+      - id: fcps
+        name: "fcps"
+        software_environment: fcps
+        repository:
+          url: https://github.com/imallona/clustbench_fcps
+          commit: 272fa5f
+        parameters:
+          - values: ["--method", "FCPS_Minimax"]
+    inputs:
+      - entries:
+          - data.matrix
+          - data.true_labels
+    outputs:
+      - id: clustering.predicted_ks_range
+        path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
+
+  - id: metrics
+    modules:
+      - id: partition_metrics
+        name: "clustbench partition metrics"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_metrics
+          commit: 8184cd4
+        parameters:
+          - values: ["--metric", "normalized_clustering_accuracy"]
+          - values: ["--metric", "adjusted_fm_score"]
+    inputs:
+      - entries:
+          - clustering.predicted_ks_range
+          - data.true_labels
+    outputs:
+      - id: metrics.scores
+        path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
diff --git a/Clustering_envmodules_smoketest.yml b/Clustering_envmodules_smoketest.yml
new file mode 100644
index 0000000..3fa8e81
--- /dev/null
+++ b/Clustering_envmodules_smoketest.yml
@@ -0,0 +1,131 @@
+id: clustering_example_envmodules
+description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
+version: 1.4
+benchmarker: "Izaskun Mallona, Daniel Incicau"
+benchmark_yaml_spec: 0.5
+
+software_backend: envmodules
+
+software_environments:
+
+  clustbench:
+    description: "clustbench on py3.12.6"
+    envmodule: clustbench/0.1.0-foss-2023b
+    conda: envs/clustbench.yml
+    apptainer: na
+
+  fcps:
+    description: "CRAN's FCPS"
+    envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
+    conda: envs/fcps.yml
+    apptainer: na
+
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
+    conda: envs/rmarkdown.yml # matches the rmarkdown entry of the conda smoketest; this file's backend is envmodules
+    apptainer: na
+
+metric_collectors:
+  - id: plotting
+    name: "Single-backend metric collector."
+    software_environment: rmarkdown
+    repository:
+      url: https://github.com/imallona/clustering_report
+      commit: 1d6bdf5
+    inputs:
+      - metrics.scores
+    outputs:
+      - id: plotting.html
+        path: "{input}/{name}/plotting_report.html"
+
+stages:
+  - id: data
+    modules:
+      - id: clustbench
+        name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_data
+          commit: 366c5a2
+        parameters: # comments depict the possible cardinalities and the number of curated labelsets
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
+    outputs:
+      - id: data.matrix
+        path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
+      - id: data.true_labels
+        path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
+
+  ## clustbench methods (fastcluster) ###################################################################
+
+  - id: clustering
+    modules:
+      - id: fastcluster
+        name: "fastcluster algorithm"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_fastcluster
+          commit: "45e43d3"
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: sklearn
+        name: "sklearn"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_sklearn
+          commit: 5877378
+        parameters:
+          - values: ["--method", "birch"]
+          - values: ["--method", "kmeans"]
+      - id: agglomerative
+        name: "agglomerative"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_agglomerative
+          commit: 5454368
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: genieclust
+        name: "genieclust"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_genieclust
+          commit: 6090043
+        parameters:
+          - values: ["--method", "genie", "--gini_threshold", 0.5]
+      - id: fcps
+        name: "fcps"
+        software_environment: fcps
+        repository:
+          url: https://github.com/imallona/clustbench_fcps
+          commit: 272fa5f
+        parameters:
+          - values: ["--method", "FCPS_Minimax"]
+    inputs:
+      - entries:
+          - data.matrix
+          - data.true_labels
+    outputs:
+      - id: clustering.predicted_ks_range
+        path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
+
+  - id: metrics
+    modules:
+      - id: partition_metrics
+        name: "clustbench partition metrics"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_metrics
+          commit: 8184cd4
+        parameters:
+          - values: ["--metric", "normalized_clustering_accuracy"]
+          - values: ["--metric", "adjusted_fm_score"]
+    inputs:
+      - entries:
+          - clustering.predicted_ks_range
+          - data.true_labels
+    outputs:
+      - id: metrics.scores
+        path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
diff --git a/Makefile b/Makefile
index 875a375..e8e942d 100644
--- a/Makefile
+++ b/Makefile
@@ -6,9 +6,26 @@ prepare_apptainer_env:
 prepare_envmodules_env:
 	cd envs && eb clustbench.eb --robot
 	cd envs && eb fcps.eb --robot
-run_with_apptainer_backend:
-	 ${OB_CMD} -b Clustering_singularity.yml
-	 mv out out_apptainer
+	cd envs && eb rmarkdown.eb --robot
+
+# short versions, to debug runs & environments
+run_with_apptainer_backend_short:
+	 ${OB_CMD} -b Clustering_apptainer_vanilla_smoketest.yml
+	 mv out out_apptainer_short
+run_with_conda_backend_short: # smoketest run; output kept apart from the full conda run's out_conda
+	 ${OB_CMD} -b Clustering_conda_smoketest.yml
+	 mv out out_conda_short
+run_with_envmodules_backend_short:
+	 ${OB_CMD} -b Clustering_envmodules_smoketest.yml
+	 mv out out_lmod_short
+
+# full versions (expect hours)
+run_with_apptainer_backend_vanilla:
+	 ${OB_CMD} -b Clustering_apptainer_vanilla.yml
+	 mv out out_apptainer_vanilla
+run_with_apptainer_backend_optimized: # full run of Clustering_apptainer_optimized.yml; output kept apart from the vanilla run
+	 ${OB_CMD} -b Clustering_apptainer_optimized.yml
+	 mv out out_apptainer_optimized
 run_with_conda_backend:
 	 ${OB_CMD} -b Clustering_conda.yml
 	 mv out out_conda
diff --git a/envs/rmarkdown.yml b/envs/rmarkdown.yml
index e57969e..ed5c65e 100644
--- a/envs/rmarkdown.yml
+++ b/envs/rmarkdown.yml
@@ -7,6 +7,8 @@ dependencies:
   - conda-forge::python=3.12.6
   - conda-forge::r-argparse
   - conda-forge::r-rmarkdown
+  - conda-forge::r-cairo
+  - conda-forge::r-svglite
   - conda-forge::r-ggplot2
-  - conda-forge::r-tidyr  
+  - conda-forge::r-tidyr
   - bioconda::bioconductor-complexheatmap

From 3890cb48664570aa7a9878dacb299146d69ced5d Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 16:15:38 +0200
Subject: [PATCH 34/60] add apptainer definition for rmarkdown

---
 envs/build_singularity.sh |  4 +++-
 envs/rmarkdown.def        | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)
 create mode 100644 envs/rmarkdown.def

diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
index 2dae40a..c34208b 100755
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -2,5 +2,7 @@
 CMD=singularity
 BUILD='build --fakeroot'
 $CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def
-$CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def
+# enable this if you want to compare with the custom python compilation
+# $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def
 $CMD ${BUILD} fcps.sif fcps_singularity_optimized.def
+$CMD ${BUILD} rmarkdown.sif rmarkdown.def
diff --git a/envs/rmarkdown.def b/envs/rmarkdown.def
new file mode 100644
index 0000000..ce7ca1e
--- /dev/null
+++ b/envs/rmarkdown.def
@@ -0,0 +1,38 @@
+Bootstrap: docker
+From: rocker/tidyverse:4.4
+
+%labels
+
+    AUTHOR izaskun.mallona@gmail.com
+    AUTHOR ben.uzh@proton.me
+
+%post
+
+    # Install python (3.12 as of noble)
+    export DEBIAN_FRONTEND=noninteractive
+    apt-get update
+    apt-get install -y git \
+        python-is-python3 \
+        python3.12 \
+        python3-virtualenv \
+        && apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+
+    # virtualenv
+    cd /opt
+    python3.12 -m venv "default"
+    . default/bin/activate
+
+    pip install \
+        "gitpython==3.1.43" \
+        "isodate==0.7.2" \
+        "pydantic-core==2.34.1"
+
+    # Install R packages
+    Rscript -e 'BiocManager::install(c("mclust", "caret", "readr", "argparse", "rmarkdown"))'
+
+    echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
+
+%environment
+
+    . /opt/default/bin/activate

From c80adc10844d9251572f00352795be17c01a61a3 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 16:29:08 +0200
Subject: [PATCH 35/60] remove unneeded dependencies

---
 envs/rmarkdown.def | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/envs/rmarkdown.def b/envs/rmarkdown.def
index ce7ca1e..8dc75b6 100644
--- a/envs/rmarkdown.def
+++ b/envs/rmarkdown.def
@@ -14,7 +14,7 @@ From: rocker/tidyverse:4.4
     apt-get install -y git \
         python-is-python3 \
         python3.12 \
-        python3-virtualenv \
+        python3.12-venv \
         && apt-get clean && \
         rm -rf /var/lib/apt/lists/*
 
@@ -29,7 +29,7 @@ From: rocker/tidyverse:4.4
         "pydantic-core==2.34.1"
 
     # Install R packages
-    Rscript -e 'BiocManager::install(c("mclust", "caret", "readr", "argparse", "rmarkdown"))'
+    Rscript -e 'BiocManager::install(c("mclust", "caret", "argparse"))'
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 

From b19a489cec78d49c57b1c2b9e6cf1c3b0604c1ca Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 16:31:01 +0200
Subject: [PATCH 36/60] update makefile

---
 Makefile | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index e8e942d..f342949 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,9 @@
 MAX_CORES ?= 10
+TIMEOUT ?= 4h
+
 # by default, we want to run all snakemake rules even if there are failures (-k)
-OB_CMD=ob run benchmark -k --local --task-timeout "4h" --cores ${MAX_CORES}
+OB_CMD=ob run benchmark -k --local --task-timeout ${TIMEOUT} --cores ${MAX_CORES}
+
 prepare_apptainer_env:
 	cd envs && ./build_singularity.sh
 prepare_envmodules_env:
@@ -14,7 +17,7 @@ run_with_apptainer_backend_short:
 	 mv out out_apptainer_short
 run_with_conda_backend_short:
 	 ${OB_CMD} -b Clustering_conda_smoketest.yml
-	 mv out out_conda
+	 mv out out_conda_short
 run_with_envmodules_backend_short:
 	 ${OB_CMD} -b Clustering_envmodules_smoketest.yml
 	 mv out out_lmod_short

From ebd69b79937e55a68e968d829910c5c6f3d80b70 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 16:38:13 +0200
Subject: [PATCH 37/60] cleanup r/fcps deps

---
 Clustering_conda.yml                          |  2 +-
 envs/build_singularity.sh                     |  2 +-
 ...cps_singularity_optimized.def => fcps.def} | 29 ++++++++-------
 envs/fcps.eb                                  |  3 +-
 envs/r.yml                                    | 12 ------
 envs/r_singularity.def                        | 37 -------------------
 6 files changed, 19 insertions(+), 66 deletions(-)
 rename envs/{fcps_singularity_optimized.def => fcps.def} (59%)
 delete mode 100644 envs/r.yml
 delete mode 100644 envs/r_singularity.def

diff --git a/Clustering_conda.yml b/Clustering_conda.yml
index 7822761..9e74ee5 100644
--- a/Clustering_conda.yml
+++ b/Clustering_conda.yml
@@ -30,7 +30,7 @@ software_environments:
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
-    software_environment: "rmarkdown"
+    software_environment: rmarkdown
     repository:
       url: https://github.com/imallona/clustering_report
       commit: 1d6bdf5
diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
index c34208b..f8596a7 100755
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -4,5 +4,5 @@ BUILD='build --fakeroot'
 $CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def
 # enable this if you want to compare with the custom python compilation
 # $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def
-$CMD ${BUILD} fcps.sif fcps_singularity_optimized.def
+$CMD ${BUILD} fcps.sif fcps.def
 $CMD ${BUILD} rmarkdown.sif rmarkdown.def
diff --git a/envs/fcps_singularity_optimized.def b/envs/fcps.def
similarity index 59%
rename from envs/fcps_singularity_optimized.def
rename to envs/fcps.def
index 6362b9e..f4eefcb 100644
--- a/envs/fcps_singularity_optimized.def
+++ b/envs/fcps.def
@@ -1,5 +1,5 @@
 Bootstrap: docker
-From: rocker/tidyverse:4.3.3
+From: rocker/tidyverse:4.4
 
 %labels
 
@@ -8,29 +8,32 @@ From: rocker/tidyverse:4.3.3
 
 %post
 
-    # Install python3.12
+    # Install python (3.12 as of noble)
+    export DEBIAN_FRONTEND=noninteractive
+    apt-get update
+    apt-get install -y git \
+        python-is-python3 \
+        python3.12 \
+        python3.12-venv \
+        && apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
     apt-get update
     apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \
         libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git \
         libgsl-dev
 
-    wget https://www.python.org/ftp/python/3.12.9/Python-3.12.9.tgz
-    tar -xf Python-3.12.9.tgz
-    cd Python-3.12.*/
-    ./configure --enable-optimizations
-    make -j 8
-    make altinstall
-
     # virtualenv
     cd /opt
     python3.12 -m venv "default"
     . default/bin/activate
 
-    # TODO: pin dependencies
-    pip install gitpython==3.1.43 isodate pydantic-core
+    pip install \
+        "gitpython==3.1.43" \
+        "isodate==0.7.2" \
+        "pydantic-core==2.34.1"
 
-    ## no versioning here
-    ## TODO(ben): get same versions as in easyconfig
+    # Install R packages
+    ## FIXME no versioning here
     Rscript -e 'BiocManager::install(c( "dbscan", "cluster", "protoclust", "energy", "argparse", "mclust", "DataVisualizations", "FCPS", "cclust"))'
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
diff --git a/envs/fcps.eb b/envs/fcps.eb
index 54c8c7d..4d86bdd 100644
--- a/envs/fcps.eb
+++ b/envs/fcps.eb
@@ -15,12 +15,11 @@ dependencies = [
     ('R', '4.3.2'),
     ('Boost', '1.82.0'), 
     ('GSL', '2.7'),
-#    ('arrow-R', '14.0.1', versionsuffix),  # required by RcisTarget
 ]
 
 exts_default_options = {
     'source_urls': [
-	'https://bioconductor.org/packages/release/bioc/src/contrib/',
+       'https://bioconductor.org/packages/release/bioc/src/contrib/',
         'https://cran.r-project.org/src/contrib/Archive/%(name)s',  # package archive
         'https://cran.r-project.org/src/contrib/',  # current version of packages
         'https://cran.freestatistics.org/src/contrib',  # mirror alternative for current packages
diff --git a/envs/r.yml b/envs/r.yml
deleted file mode 100644
index 456e139..0000000
--- a/envs/r.yml
+++ /dev/null
@@ -1,12 +0,0 @@
-name: r_for_metrics
-channels:
-  - conda-forge
-  - nodefaults
-dependencies:
-  - conda-forge::python=3.12.6
-  - conda-forge::r-mclust
-  - conda-forge::r-caret
-  - conda-forge::r-dplyr
-  - conda-forge::r-readr
-  - conda-forge::r-argparse
-  
diff --git a/envs/r_singularity.def b/envs/r_singularity.def
deleted file mode 100644
index f1f9ec9..0000000
--- a/envs/r_singularity.def
+++ /dev/null
@@ -1,37 +0,0 @@
-Bootstrap: docker
-From: rocker/tidyverse:4.4
-
-%labels
-
-    AUTHOR izaskun.mallona@gmail.com
-
-%post
-
-    # Install python3.12
-    apt-get update
-    apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \
-        libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git
-
-    wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz
-    tar -xf Python-3.12.6.tgz
-    cd Python-3.12.*/
-    ./configure --enable-optimizations
-    make -j 4
-    make altinstall
-
-    # virtualenv
-    cd /opt
-    python3.12 -m venv "default"
-    . default/bin/activate
-
-    pip install gitpython==3.1.43 isodate pydantic-core
-
-    # Install R packages
-    
-    Rscript -e 'BiocManager::install(c("mclust", "caret", "readr", "argparse"))'
-
-    echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
-
-%environment
-
-    . /opt/default/bin/activate

From 1afaa2f2830f11563973a1ef9720753b0a47ceec Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 16:44:36 +0200
Subject: [PATCH 38/60] cleanup image

---
 envs/build_singularity.sh | 2 +-
 envs/fcps.def             | 4 ----
 envs/rmarkdown.def        | 2 ++
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
index f8596a7..83203c8 100755
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -5,4 +5,4 @@ $CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def
 # enable this if you want to compare with the custom python compilation
 # $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def
 $CMD ${BUILD} fcps.sif fcps.def
-$CMD ${BUILD} rmarkdown.sif rmarkdown.def
+$CMD ${BUILD} rmarkdown.sif rmarkdown.def  # this one is very similar to fcps, remove
diff --git a/envs/fcps.def b/envs/fcps.def
index f4eefcb..922d7f8 100644
--- a/envs/fcps.def
+++ b/envs/fcps.def
@@ -17,10 +17,6 @@ From: rocker/tidyverse:4.4
         python3.12-venv \
         && apt-get clean && \
         rm -rf /var/lib/apt/lists/*
-    apt-get update
-    apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \
-        libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git \
-        libgsl-dev
 
     # virtualenv
     cd /opt
diff --git a/envs/rmarkdown.def b/envs/rmarkdown.def
index 8dc75b6..aa20cc1 100644
--- a/envs/rmarkdown.def
+++ b/envs/rmarkdown.def
@@ -1,6 +1,8 @@
 Bootstrap: docker
 From: rocker/tidyverse:4.4
 
+# TODO: we could merge this one with fcps.def, no need to duplicate the image.
+
 %labels
 
     AUTHOR izaskun.mallona@gmail.com

From 9e2168a754e7a93e11867f68f4548f1415301c79 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 16:54:57 +0200
Subject: [PATCH 39/60] update readme

---
 envs/README.md | 43 +++++++++++++++++++++----------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/envs/README.md b/envs/README.md
index 69aa5c1..3cab925 100644
--- a/envs/README.md
+++ b/envs/README.md
@@ -1,10 +1,9 @@
 We distribute `Clustering.yml` runs with different backends.
 
-- `Clustering_conda.yml`. Conda semi-reproducible (no pinning, pip)
-- `Clustering_singularity.yml`. Singularity semi-reproducible, local SIF files.
-- `Clustering_oras.yml`. Singularity semi-reproducible, prebuilt remote images.
-- `Clustering_envmodules.yml`. Easybuilt with default optimization.
-
+- `Clustering_conda.yml`. Conda semi-reproducible (no pinning, using pip)
+- `Clustering_apptainer.yml`. Singularity semi-reproducible, local SIF files.
+- `Clustering_oras.yml`. Singularity semi-reproducible, prebuilt remote images from an ORAS registry.
+- `Clustering_envmodules.yml`. Easybuild backend with default optimization.
 
 ## Conda
 
@@ -12,8 +11,7 @@ We distribute `Clustering.yml` runs with different backends.
 
 - `clustbench.yml`
 - `fcps.yml`
-- `r.yml`
-- `sklearn.yml`
+- `rmarkdown.yml`
 
 ### How to build
 
@@ -23,24 +21,25 @@ No need to `ob software conda pin / prepare`; let `ob run benchmark -b Clusterin
 
 ### Files
 
-- `clustbench_singularity.def`
-- `fcps_singularity.def`
-- `r_singularity.def`
-- `sklearn_singularity.def`
+The apptainer images are based on ubuntu-noble docker images.
+
+The "optimized" one does a custom python 3.12 compilation; the "vanillapy" one uses the stock py3.12 interpreter from the official ubuntu docker image.
+
+- `clustbench_apptainer_optimized.def`
+- `clustbench_apptainer_vanillapy.def`
+- `fcps.def`
+- `rmarkdown.def`
 
 ### How to build
 
-- `build_singularity.sh`
+- `make prepare_apptainer_env` from the root folder.
 
 ## Aptainer semi-reproducible and remote
 
-No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://gitlab.renkulab.io/izaskun.mallona/clustering_example/container_registry.
+TODO: push to the registry (how?)
 
-## Apptainer (reproducible) with easybuild
-
-Doing...
+No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://gitlab.renkulab.io/izaskun.mallona/clustering_example/container_registry.
 
-Lorem ipsum.
 
 ## envmodules - reproducible builds with easybuild
 
@@ -48,11 +47,11 @@ Lorem ipsum.
 
 - `clustbench.eb`
 - `fcps.eb`
+- `rmarkdown.eb`
+- `rmarkdown-python.eb`
 
 ### How to build
 
-1. Mind https://github.com/easybuilders/easybuild-easyconfigs/commit/e29210626f076e3a207f1abf3759ea124e28f8b2
-2. Mind `clustbench` is only installable from https://github.com/gagolews/genieclust/archive/refs/tags/v1.1.6.tar.gz and not from pypi's tgz (!), download it locally and ideally update the easyconfig to automate this
-3. `python3-wget` from pypi doesn't look very well maintaned
-4. `eb fcps.eb --robot`
-5. `eb clustbench.eb --robot`
+- `make prepare_envmodules_env` from the root folder.
+- `python3-wget` from pypi doesn't look very well maintained
+

From 6199c0a11bbc88a944d07e4b79bf329fc9c55990 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 17:00:25 +0200
Subject: [PATCH 40/60] fixes

---
 envs/clustbench.eb | 5 -----
 envs/fcps.eb       | 2 +-
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/envs/clustbench.eb b/envs/clustbench.eb
index 0e86911..daae6dd 100644
--- a/envs/clustbench.eb
+++ b/envs/clustbench.eb
@@ -13,9 +13,6 @@ dependencies = [
     ('SciPy-bundle', '2023.11'),
     ('matplotlib', '3.8.2'),
     ('scikit-learn', '1.4.0'),
-# FIXME: I think this is not needed -- ben
-#    ('meson-python', '0.15.0'),
-#    ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b
 ]
 
 exts_list = [
@@ -48,5 +45,3 @@ exts_list = [
 ]
 
 moduleclass = 'bio'
-
-
diff --git a/envs/fcps.eb b/envs/fcps.eb
index 4d86bdd..692bf0b 100644
--- a/envs/fcps.eb
+++ b/envs/fcps.eb
@@ -13,7 +13,7 @@ builddependencies = [('pkgconf', '1.9.5')]
 
 dependencies = [
     ('R', '4.3.2'),
-    ('Boost', '1.82.0'), 
+    ('Boost', '1.82.0'),
     ('GSL', '2.7'),
 ]
 

From b017cb02a71b83766f831b9bf5b4d483eb8dbe9f Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 17:18:33 +0200
Subject: [PATCH 41/60] apptainer smoketest

---
 Clustering_apptainer_vanilla.yml           |   4 +-
 Clustering_apptainer_vanilla_smoketest.yml | 129 +++++++++++++++++++++
 2 files changed, 131 insertions(+), 2 deletions(-)
 create mode 100644 Clustering_apptainer_vanilla_smoketest.yml

diff --git a/Clustering_apptainer_vanilla.yml b/Clustering_apptainer_vanilla.yml
index 46b8ea4..6bc5edd 100644
--- a/Clustering_apptainer_vanilla.yml
+++ b/Clustering_apptainer_vanilla.yml
@@ -10,9 +10,9 @@ software_backend: apptainer
 software_environments:
 
   clustbench:
-    description: "clustbench on py3.12.6"
-    conda: envs/clustbench.yml # not used
+    description: "clustbench on py3.12.3, default python"
     envmodule: na
+    conda: envs/clustbench.yml # not used
     apptainer: envs/clustbench-vanilla.sif
 
   fcps:
diff --git a/Clustering_apptainer_vanilla_smoketest.yml b/Clustering_apptainer_vanilla_smoketest.yml
new file mode 100644
index 0000000..99aff2e
--- /dev/null
+++ b/Clustering_apptainer_vanilla_smoketest.yml
@@ -0,0 +1,129 @@
+id: clustering_example_envmodules
+description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
+version: 1.4
+benchmarker: "Izaskun Mallona, Daniel Incicau"
+benchmark_yaml_spec: 0.5
+
+software_backend: apptainer
+
+software_environments:
+
+  clustbench:
+    description: "clustbench on py3.12.3, default python"
+    envmodule: na
+    conda: envs/clustbench.yml # not used
+    apptainer: envs/clustbench-vanilla.sif
+
+  fcps:
+    description: "CRAN's FCPS"
+    envmodule: na
+    conda: envs/fcps.yml # not used
+    apptainer: envs/fcps.sif
+
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    envmodule: na
+    conda: envs/rmarkdown.yml # not used
+    apptainer: envs/rmarkdown.sif
+
+metric_collectors:
+  - id: plotting
+    name: "Single-backend metric collector."
+    software_environment: rmarkdown
+    repository:
+      url: https://github.com/imallona/clustering_report
+      commit: 1d6bdf5
+    inputs:
+      - metrics.scores
+    outputs:
+      - id: plotting.html
+        path: "{input}/{name}/plotting_report.html"
+
+stages:
+  - id: data
+    modules:
+      - id: clustbench
+        name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_data
+          commit: 366c5a2
+        parameters: # comments depict the possible cardinalities and the number of curated labelsets
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
+    outputs:
+      - id: data.matrix
+        path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
+      - id: data.true_labels
+        path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
+
+  - id: clustering
+    modules:
+      - id: fastcluster
+        name: "fastcluster algorithm"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_fastcluster
+          commit: "45e43d3"
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: sklearn
+        name: "sklearn"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_sklearn
+          commit: 5877378
+        parameters:
+          - values: ["--method", "birch"]
+          - values: ["--method", "kmeans"]
+      - id: agglomerative
+        name: "agglomerative"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_agglomerative
+          commit: 5454368
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: genieclust
+        name: "genieclust"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_genieclust
+          commit: 6090043
+        parameters:
+          - values: ["--method", "genie", "--gini_threshold", 0.5]
+      - id: fcps
+        name: "fcps"
+        software_environment: fcps
+        repository:
+          url: https://github.com/imallona/clustbench_fcps
+          commit: 272fa5f
+        parameters:
+          - values: ["--method", "FCPS_Minimax"]
+    inputs:
+      - entries:
+          - data.matrix
+          - data.true_labels
+    outputs:
+      - id: clustering.predicted_ks_range
+        path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
+
+  - id: metrics
+    modules:
+      - id: partition_metrics
+        name: "clustbench partition metrics"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_metrics
+          commit: 8184cd4
+        parameters:
+          - values: ["--metric", "normalized_clustering_accuracy"]
+          - values: ["--metric", "adjusted_fm_score"]
+    inputs:
+      - entries:
+          - clustering.predicted_ks_range
+          - data.true_labels
+    outputs:
+      - id: metrics.scores
+        path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"

From 98777a52be5fc9500e715a42dd1f4e146bc467b6 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 17:21:29 +0200
Subject: [PATCH 42/60] add git in the image

---
 envs/clustbench_apptainer_vanillapy.def | 1 +
 1 file changed, 1 insertion(+)

diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def
index 5d388bf..63f764a 100644
--- a/envs/clustbench_apptainer_vanillapy.def
+++ b/envs/clustbench_apptainer_vanillapy.def
@@ -13,6 +13,7 @@ From: ubuntu:noble-20250404
         python3-venv \
         python3-pip \
         ca-certificates \
+        git \
         && apt-get clean && \
         rm -rf /var/lib/apt/lists/*
 

From f4ae29d1600097a42fc906557a085dea97ed8cf0 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 17:33:56 +0200
Subject: [PATCH 43/60] try to debug fastcluster problem

---
 envs/clustbench_apptainer_optimized.def | 4 ++--
 envs/clustbench_apptainer_vanillapy.def | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def
index 19726c2..8fc7e08 100644
--- a/envs/clustbench_apptainer_optimized.def
+++ b/envs/clustbench_apptainer_optimized.def
@@ -58,7 +58,7 @@ From: ubuntu:noble-20250404
 
     # Install required packages with pip
 
-    pip install -U pip
+    pip install -U pip wheel
 
     pip install \
       "clustering-benchmarks==1.1.6" \
@@ -83,7 +83,7 @@ From: ubuntu:noble-20250404
       "six==1.17.0" \
       "threadpoolctl==3.6.0" \
       "tzdata==2025.2" \
-      "fastcluster==1.2.6" \
+      "fastcluster==1.3.0" \
       "gitpython==3.1.43" \
       "isodate==0.7.2" \
       "pydantic-core==2.34.1"
diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def
index 63f764a..ff9dd91 100644
--- a/envs/clustbench_apptainer_vanillapy.def
+++ b/envs/clustbench_apptainer_vanillapy.def
@@ -24,7 +24,7 @@ From: ubuntu:noble-20250404
 
     # Install required packages with pip
 
-    pip install -U pip
+    pip install -U pip wheel
 
     pip install \
       "clustering-benchmarks==1.1.6" \
@@ -49,7 +49,7 @@ From: ubuntu:noble-20250404
       "six==1.17.0" \
       "threadpoolctl==3.6.0" \
       "tzdata==2025.2" \
-      "fastcluster==1.2.6" \
+      "fastcluster==1.3.0" \
       "gitpython==3.1.43" \
       "isodate==0.7.2" \
       "pydantic-core==2.34.1"

From 72cdc598acfd10c2fd73bee49f7b66fdd6a62591 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Wed, 14 May 2025 13:26:23 +0200
Subject: [PATCH 44/60] fail if the exit code fails

---
 .github/workflows/benchmark.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 2a55846..e22b368 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -59,7 +59,7 @@ jobs:
 
       - name: Run benchmark
         shell: bash -l {0}
-        continue-on-error: true
+        continue-on-error: false
         run: |
           echo "y" | ob run benchmark -b Clustering.yaml --local --cores 3 --continue-on-error
 
@@ -98,7 +98,7 @@ jobs:
 
       - name: Deploy to GitHub Pages
         uses: actions/deploy-pages@v4
-          
+
       - name: Create Job Summary
         if: always()
         run: |
@@ -106,4 +106,3 @@ jobs:
           echo "- [Plotting Report](https://${{ github.repository_owner }}.github.io/${{ github.event.repository.name }})" >> $GITHUB_STEP_SUMMARY
           echo "### All Outputs" >> $GITHUB_STEP_SUMMARY
           echo "- [Complete Benchmark Output](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}#artifacts)" >> $GITHUB_STEP_SUMMARY
-    

From 01243de1b555e2e5d4e7b31228d66d8a335edcb3 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Wed, 14 May 2025 13:29:16 +0200
Subject: [PATCH 45/60] use conda short for test

---
 .github/workflows/benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index e22b368..b6cb977 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -61,7 +61,7 @@ jobs:
         shell: bash -l {0}
         continue-on-error: false
         run: |
-          echo "y" | ob run benchmark -b Clustering.yaml --local --cores 3 --continue-on-error
+          echo "y" | ob run benchmark -b Clustering_conda_smoketest.yml --local --cores 3 --continue-on-error
 
   upload-artifact:
     name: Benchmark Artifact

From 7b213c9151be563d0fedcbfc9a46e6b10e4e8b2c Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Fri, 16 May 2025 12:28:39 +0200
Subject: [PATCH 46/60] remove rmarkdown dedicated singularity image

---
 Clustering.yaml                            |  3 +-
 Clustering_apptainer_optimized.yml         |  5 +--
 Clustering_apptainer_vanilla.yml           |  5 +--
 Clustering_apptainer_vanilla_smoketest.yml |  3 +-
 Clustering_conda.yml                       |  3 +-
 Clustering_conda_smoketest.yml             |  3 +-
 Clustering_envmodules.yml                  |  3 +-
 Clustering_envmodules_smoketest.yml        |  3 +-
 Clustering_oras.yml                        |  3 +-
 envs/build_singularity.sh                  |  3 +-
 envs/fcps.def                              |  2 +-
 envs/rmarkdown.def                         | 40 ----------------------
 12 files changed, 22 insertions(+), 54 deletions(-)
 delete mode 100644 envs/rmarkdown.def

diff --git a/Clustering.yaml b/Clustering.yaml
index 689be2c..778675e 100644
--- a/Clustering.yaml
+++ b/Clustering.yaml
@@ -56,7 +56,8 @@ stages:
         software_environment: "clustbench"
         repository:
           url: https://github.com/imallona/clustbench_data
-          commit: 366c5a2
+          commit: 31ac323
+
         parameters: # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1
           # - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] #  2 1
diff --git a/Clustering_apptainer_optimized.yml b/Clustering_apptainer_optimized.yml
index a073683..0a479e5 100644
--- a/Clustering_apptainer_optimized.yml
+++ b/Clustering_apptainer_optimized.yml
@@ -25,7 +25,7 @@ software_environments:
     description: "R with some plotting dependencies"
     conda: envs/rmarkdown.yml # not used
     envmodule: na
-    apptainer: envs/rmarkdown.sif
+    apptainer: envs/fcps.sif  # we reuse fcps env
 
 
 metric_collectors:
@@ -50,7 +50,8 @@ stages:
         software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_data
-          commit: 366c5a2
+          commit: 31ac323
+
         parameters:  # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] #	2	1
diff --git a/Clustering_apptainer_vanilla.yml b/Clustering_apptainer_vanilla.yml
index 6bc5edd..49e188c 100644
--- a/Clustering_apptainer_vanilla.yml
+++ b/Clustering_apptainer_vanilla.yml
@@ -25,7 +25,7 @@ software_environments:
     description: "R with some plotting dependencies"
     conda: envs/rmarkdown.yml # not used
     envmodule: na
-    apptainer: envs/rmarkdown.sif
+    apptainer: envs/fcps.sif  # we reuse fcps env
 
 
 metric_collectors:
@@ -50,7 +50,8 @@ stages:
         software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_data
-          commit: 366c5a2
+          commit: 31ac323
+
         parameters:  # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] #	2	1
diff --git a/Clustering_apptainer_vanilla_smoketest.yml b/Clustering_apptainer_vanilla_smoketest.yml
index 99aff2e..0a2139f 100644
--- a/Clustering_apptainer_vanilla_smoketest.yml
+++ b/Clustering_apptainer_vanilla_smoketest.yml
@@ -24,7 +24,7 @@
     description: "R with some plotting dependencies"
     envmodule: na
     conda: envs/rmarkdown.yml # not used
-    apptainer: envs/rmarkdown.sif
+    apptainer: envs/fcps.sif  # we reuse fcps env
 
 metric_collectors:
   - id: plotting
@@ -47,7 +47,8 @@ stages:
         software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_data
-          commit: 366c5a2
+          commit: 31ac323
+
         parameters: # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
     outputs:
diff --git a/Clustering_conda.yml b/Clustering_conda.yml
index 9e74ee5..17e48f0 100644
--- a/Clustering_conda.yml
+++ b/Clustering_conda.yml
@@ -50,7 +50,8 @@ stages:
         software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_data
-          commit: 366c5a2
+          commit: 31ac323
+
         parameters:  # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] #	2	1
diff --git a/Clustering_conda_smoketest.yml b/Clustering_conda_smoketest.yml
index 15215d7..9f66440 100644
--- a/Clustering_conda_smoketest.yml
+++ b/Clustering_conda_smoketest.yml
@@ -47,7 +47,8 @@ stages:
         software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_data
-          commit: 366c5a2
+          commit: 31ac323
+
         parameters: # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
     outputs:
diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml
index a2112d4..43c24fa 100644
--- a/Clustering_envmodules.yml
+++ b/Clustering_envmodules.yml
@@ -50,7 +50,8 @@ stages:
         software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_data
-          commit: 366c5a2
+          commit: 31ac323
+
         parameters: # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
           - values: [
diff --git a/Clustering_envmodules_smoketest.yml b/Clustering_envmodules_smoketest.yml
index 3fa8e81..27570bc 100644
--- a/Clustering_envmodules_smoketest.yml
+++ b/Clustering_envmodules_smoketest.yml
@@ -47,7 +47,8 @@ stages:
         software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_data
-          commit: 366c5a2
+          commit: 31ac323
+
         parameters: # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
     outputs:
diff --git a/Clustering_oras.yml b/Clustering_oras.yml
index c6f0d7e..ff2736b 100644
--- a/Clustering_oras.yml
+++ b/Clustering_oras.yml
@@ -40,7 +40,8 @@ stages:
         software_environment: "clustbench"
         repository:
           url: https://github.com/imallona/clustbench_data
-          commit: 366c5a2
+          commit: 31ac323
+
         parameters:
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] #	2	1
diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
index 83203c8..a8cd330 100755
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -1,8 +1,7 @@
 #!/bin/sh
 CMD=singularity
 BUILD='build --fakeroot'
-$CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def
 # enable this if you want to compare with the custom python compilation
 # $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def
+$CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def
 $CMD ${BUILD} fcps.sif fcps.def
-$CMD ${BUILD} rmarkdown.sif rmarkdown.def  # this one is very similar to fcps, remove
diff --git a/envs/fcps.def b/envs/fcps.def
index 922d7f8..a4996d6 100644
--- a/envs/fcps.def
+++ b/envs/fcps.def
@@ -30,7 +30,7 @@ From: rocker/tidyverse:4.4
 
     # Install R packages
     ## FIXME no versioning here
-    Rscript -e 'BiocManager::install(c( "dbscan", "cluster", "protoclust", "energy", "argparse", "mclust", "DataVisualizations", "FCPS", "cclust"))'
+    Rscript -e 'BiocManager::install(c( "dbscan", "cluster", "protoclust", "energy", "argparse", "mclust", "caret", "DataVisualizations", "FCPS", "cclust"))'
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 
diff --git a/envs/rmarkdown.def b/envs/rmarkdown.def
deleted file mode 100644
index aa20cc1..0000000
--- a/envs/rmarkdown.def
+++ /dev/null
@@ -1,40 +0,0 @@
-Bootstrap: docker
-From: rocker/tidyverse:4.4
-
-# TODO: we could merge this one with fcps.def, no need to duplicate the image.
-
-%labels
-
-    AUTHOR izaskun.mallona@gmail.com
-    AUTHOR ben.uzh@proton.me
-
-%post
-
-    # Install python (3.12 as of noble)
-    export DEBIAN_FRONTEND=noninteractive
-    apt-get update
-    apt-get install -y git \
-        python-is-python3 \
-        python3.12 \
-        python3.12-venv \
-        && apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-
-    # virtualenv
-    cd /opt
-    python3.12 -m venv "default"
-    . default/bin/activate
-
-    pip install \
-        "gitpython==3.1.43" \
-        "isodate==0.7.2" \
-        "pydantic-core==2.34.1"
-
-    # Install R packages
-    Rscript -e 'BiocManager::install(c("mclust", "caret", "argparse"))'
-
-    echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
-
-%environment
-
-    . /opt/default/bin/activate

From ce34ee2ebdef1dd5366a07c5226853d92ed1a084 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Fri, 16 May 2025 13:22:44 +0200
Subject: [PATCH 47/60] remove wget dependency from apptainer, add upload
 script

---
 envs/build_singularity.sh               |  4 ++++
 envs/clustbench_apptainer_optimized.def |  3 ---
 envs/clustbench_apptainer_vanillapy.def |  3 ---
 envs/upload_to_registry.sh              | 12 ++++++++++++
 4 files changed, 16 insertions(+), 6 deletions(-)
 create mode 100644 envs/upload_to_registry.sh

diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
index a8cd330..430ed30 100755
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -1,4 +1,8 @@
 #!/bin/sh
+# Builds singularity images.
+# Installation guide: check https://apptainer.org/docs/user/latest/quick_start.html#installation
+# Additionally, you will need:
+# apt install fakeroot uidmap
 CMD=singularity
 BUILD='build --fakeroot'
 # enable this if you want to compare with the custom python compilation
diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def
index 8fc7e08..846bae3 100644
--- a/envs/clustbench_apptainer_optimized.def
+++ b/envs/clustbench_apptainer_optimized.def
@@ -88,9 +88,6 @@ From: ubuntu:noble-20250404
       "isodate==0.7.2" \
       "pydantic-core==2.34.1"
 
-    # TODO: can we use something more maintained?
-    pip install --pre "python3-wget==0.0.2-beta1"
-
     # Do some cleanup to keep the image slim
     rm -rf ~/.cache
     rm -rf ~/src
diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def
index ff9dd91..a40366c 100644
--- a/envs/clustbench_apptainer_vanillapy.def
+++ b/envs/clustbench_apptainer_vanillapy.def
@@ -54,9 +54,6 @@ From: ubuntu:noble-20250404
       "isodate==0.7.2" \
       "pydantic-core==2.34.1"
 
-    # TODO: can we use something more maintained?
-    pip install --pre "python3-wget==0.0.2-beta1"
-
     # Do some cleanup to keep the image slim
     rm -rf ~/.cache
 
diff --git a/envs/upload_to_registry.sh b/envs/upload_to_registry.sh
new file mode 100644
index 0000000..7e45e5a
--- /dev/null
+++ b/envs/upload_to_registry.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+USER=user
+REGISTRY=quay.io
+ORGANIZATION=omnibenchmark
+CLUSTBENCH_REPO=clustbench-vanilla
+CLUSTBENCH_TAG=0.1.0
+FCPS_REPO=fcps
+FCPS_TAG=0.1.0
+# Log in as ${USER}, then push each locally built .sif image to ${REGISTRY} over ORAS.
+singularity registry login --username "${USER}" docker://${REGISTRY}
+singularity push ${CLUSTBENCH_REPO}.sif oras://${REGISTRY}/${ORGANIZATION}/${CLUSTBENCH_REPO}:${CLUSTBENCH_TAG}
+singularity push ${FCPS_REPO}.sif oras://${REGISTRY}/${ORGANIZATION}/${FCPS_REPO}:${FCPS_TAG}

From 0fe20a635adea19663dc81772fd75fe87dc8acdf Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Fri, 16 May 2025 13:23:05 +0200
Subject: [PATCH 48/60] remove python3-wget

---
 envs/README.md     | 9 ++-------
 envs/clustbench.eb | 5 -----
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/envs/README.md b/envs/README.md
index 3cab925..4c68a1c 100644
--- a/envs/README.md
+++ b/envs/README.md
@@ -28,18 +28,14 @@ The "optimized" one does a custom python 3.12 compilation; the vanillapy stocks
 - `clustbench_apptainer_optimized.def`
 - `clustbench_apptainer_vanillapy.def`
 - `fcps.def`
-- `rmarkdown.def`
 
 ### How to build
 
 - `make prepare_apptainer_env` from the root folder.
 
-## Aptainer semi-reproducible and remote
-
-TODO: push to the registry (how?)
-
-No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://gitlab.renkulab.io/izaskun.mallona/clustering_example/container_registry.
+## Aptainer semi-reproducible with registry pull
 
+No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://quay.io/omnibenchmark registry.
 
 ## envmodules - reproducible builds with easybuild
 
@@ -53,5 +49,4 @@ No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml
 ### How to build
 
 - `make prepare_envmodules_env` from the root folder.
-- `python3-wget` from pypi doesn't look very well maintaned
 
diff --git a/envs/clustbench.eb b/envs/clustbench.eb
index daae6dd..7064c67 100644
--- a/envs/clustbench.eb
+++ b/envs/clustbench.eb
@@ -28,11 +28,6 @@ exts_list = [
     ('hurry.filesize', '0.9', {
         'checksums': ['f5368329adbef86accd3bc9490522340bb79260455ae89b1a42c10f63801b9a6'],
     }),
-    ('python3-wget', '0.0.2-beta1', {
-        'modulename': 'wget',
-        'source_urls': ['https://files.pythonhosted.org/packages/84/a4/d9da2989a3d937e94616ef07f0630c507f6baa77ad37f94ceb06b36cacc1/'],
-        'checksums': ['bbe7f44b3c28c4f7126aff20e8a438e78f6e4f1878d8b0c4940e87363813c17d'],
-    }),
     ('genieclust', '1.1.6', {
         'download_dep_fail': False,
         'install_src': 'https://files.pythonhosted.org/packages/2a/09/d1fd7b02cfabe76262d0f88d74fa71dc93e857525f8249539ec5ab174292/genieclust-1.1.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl',

From d6731437de350627a028310a6ce86e4a8de4916a Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Fri, 16 May 2025 13:32:12 +0200
Subject: [PATCH 49/60] add yaml for registry runs

---
 ...s.yml => Clustering_apptainer_registry.yml |  59 ++++----
 Clustering_apptainer_registry_smoketest.yml   | 133 ++++++++++++++++++
 Clustering_apptainer_vanilla.yml              |   2 +-
 Makefile                                      |   5 +
 envs/README.md                                |   6 +-
 5 files changed, 177 insertions(+), 28 deletions(-)
 rename Clustering_oras.yml => Clustering_apptainer_registry.yml (88%)
 create mode 100644 Clustering_apptainer_registry_smoketest.yml

diff --git a/Clustering_oras.yml b/Clustering_apptainer_registry.yml
similarity index 88%
rename from Clustering_oras.yml
rename to Clustering_apptainer_registry.yml
index ff2736b..7e090e3 100644
--- a/Clustering_oras.yml
+++ b/Clustering_apptainer_registry.yml
@@ -1,48 +1,59 @@
 id: clustering_example_oras
-description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. Caution dirty apptainer sifs.
+
+description: Clustering benchmark on Gagolewski's. Using ORAS registry.
 version: 1.5
 
-benchmarker: "Izaskun Mallona, Daniel Incicau"
+benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
 benchmark_yaml_spec: 0.4
 
-#storage: https://play.min.io
-#storage_api: S3
-#storage_bucket_name: clustering_example
-
 software_backend: apptainer
 
 software_environments:
 
   clustbench:
-    description: "clustbench on py3.12.6"
+    description: "clustbench on py3.12.3, default python"
+    envmodule: na
     conda: envs/clustbench.yml # not used
-    envmodule: clustbench
-    apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/clustbench:latest
+    apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0
 
   fcps:
     description: "CRAN's FCPS"
     conda: envs/fcps.yml # not used
     envmodule: na
-    apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/fcps:latest
+    apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
 
   rmarkdown:
     description: "R with some plotting dependencies"
     conda: envs/rmarkdown.yml # not used
     envmodule: na
-    apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/rmarkdown:latest
+    apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
+
+
+metric_collectors:
+  - id: plotting
+    name: "Single-backend metric collector."
+    software_environment: rmarkdown
+    repository:
+      url: https://github.com/imallona/clustering_report
+      commit: 1d6bdf5
+    inputs:
+      - metrics.scores
+    outputs:
+      - id: plotting.html
+        path: "{input}/{name}/plotting_report.html"
 
 stages:
 
   - id: data
     modules:
       - id: clustbench
-        name: "clustbench datasets"
-        software_environment: "clustbench"
+        name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_data
           commit: 31ac323
 
-        parameters:
+        parameters:  # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] #	2	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] #	2	2
@@ -104,23 +115,20 @@ stages:
           - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] #	4	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] #	3	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] #	5	1
-          - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] #	4	1  
+          - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] #	4	1
     outputs:
       - id: data.matrix
         path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
       - id: data.true_labels
         path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
 
-  ## clustbench methods (fastcluster) ###################################################################
-  
   - id: clustering
     modules:
       - id: fastcluster
         name: "fastcluster algorithm"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_fastcluster
-          # url: /home/imallona/src/clustbench_fastcluster/
           commit: "45e43d3"
         parameters:
           - values: ["--linkage", "complete"]
@@ -129,12 +137,12 @@ stages:
           - values: ["--linkage", "weighted"]
           - values: ["--linkage", "median"]
           - values: ["--linkage", "centroid"]
+
       - id: sklearn
-        name: "sklearn"
-        software_environment: "clustbench"
+        name: sklearn
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_sklearn
-          #url: /home/imallona/src/clustbench_sklearn
           commit: 5877378
         parameters:
           - values: ["--method", "birch"]
@@ -152,8 +160,8 @@ stages:
           - values: ["--linkage", "complete"]
           - values: ["--linkage", "ward"]
       - id: genieclust
-        name: "genieclust"
-        software_environment: "clustbench"
+        name: genieclust
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_genieclust
           commit: 6090043
@@ -192,7 +200,7 @@ stages:
     modules:
       - id: partition_metrics
         name: "clustbench partition metrics"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_metrics
           commit: 9132d45
@@ -215,4 +223,3 @@ stages:
     outputs:
       - id: metrics.scores
         path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
-
diff --git a/Clustering_apptainer_registry_smoketest.yml b/Clustering_apptainer_registry_smoketest.yml
new file mode 100644
index 0000000..7aae229
--- /dev/null
+++ b/Clustering_apptainer_registry_smoketest.yml
@@ -0,0 +1,133 @@
+id: clustering_example_oras
+
+description: Clustering benchmark on Gagolewski's. Using ORAS registry.
+version: 1.5
+
+benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
+benchmark_yaml_spec: 0.4
+
+software_backend: apptainer
+
+software_environments:
+
+  clustbench:
+    description: "clustbench on py3.12.3, default python"
+    envmodule: na
+    conda: envs/clustbench.yml # not used
+    apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0
+
+  fcps:
+    description: "CRAN's FCPS"
+    conda: envs/fcps.yml # not used
+    envmodule: na
+    apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
+
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    conda: envs/rmarkdown.yml # not used
+    envmodule: na
+    apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
+
+
+metric_collectors:
+  - id: plotting
+    name: "Single-backend metric collector."
+    software_environment: rmarkdown
+    repository:
+      url: https://github.com/imallona/clustering_report
+      commit: 1d6bdf5
+    inputs:
+      - metrics.scores
+    outputs:
+      - id: plotting.html
+        path: "{input}/{name}/plotting_report.html"
+
+stages:
+  - id: data
+    modules:
+      - id: clustbench
+        name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_data
+          commit: 31ac323
+
+        parameters: # comments depict the possible cardinalities and the number of curated labelsets
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
+    outputs:
+      - id: data.matrix
+        path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
+      - id: data.true_labels
+        path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
+
+  - id: clustering
+    modules:
+      - id: fastcluster
+        name: "fastcluster algorithm"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_fastcluster
+          commit: "45e43d3"
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: sklearn
+        name: "sklearn"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_sklearn
+          commit: 5877378
+        parameters:
+          - values: ["--method", "birch"]
+          - values: ["--method", "kmeans"]
+      - id: agglomerative
+        name: "agglomerative"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_agglomerative
+          commit: 5454368
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: genieclust
+        name: "genieclust"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_genieclust
+          commit: 6090043
+        parameters:
+          - values: ["--method", "genie", "--gini_threshold", 0.5]
+      - id: fcps
+        name: "fcps"
+        software_environment: fcps
+        repository:
+          url: https://github.com/imallona/clustbench_fcps
+          commit: 272fa5f
+        parameters:
+          - values: ["--method", "FCPS_Minimax"]
+    inputs:
+      - entries:
+          - data.matrix
+          - data.true_labels
+    outputs:
+      - id: clustering.predicted_ks_range
+        path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
+
+  - id: metrics
+    modules:
+      - id: partition_metrics
+        name: "clustbench partition metrics"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_metrics
+          commit: 8184cd4
+        parameters:
+          - values: ["--metric", "normalized_clustering_accuracy"]
+          - values: ["--metric", "adjusted_fm_score"]
+    inputs:
+      - entries:
+          - clustering.predicted_ks_range
+          - data.true_labels
+    outputs:
+      - id: metrics.scores
+        path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
diff --git a/Clustering_apptainer_vanilla.yml b/Clustering_apptainer_vanilla.yml
index 49e188c..f11b80a 100644
--- a/Clustering_apptainer_vanilla.yml
+++ b/Clustering_apptainer_vanilla.yml
@@ -38,7 +38,7 @@ metric_collectors:
     inputs:
       - metrics.scores
     outputs:
-      - id: plotting.html
+       id: plotting.html
         path: "{input}/{name}/plotting_report.html"
 
 stages:
diff --git a/Makefile b/Makefile
index f342949..27029dd 100644
--- a/Makefile
+++ b/Makefile
@@ -12,6 +12,8 @@ prepare_envmodules_env:
 	cd envs && eb rmarkdown.eb --robot
 
 # short versions, to debug runs & environments
+run_with_apptainer_backend_registry_short:
+	 ${OB_CMD} -b Clustering_apptainer_registry_smoketest.yml
 run_with_apptainer_backend_short:
 	 ${OB_CMD} -b Clustering_apptainer_vanilla_smoketest.yml
 	 mv out out_apptainer_short
@@ -23,6 +25,9 @@ run_with_envmodules_backend_short:
 	 mv out out_lmod_short
 
 # full versions (expect hours)
+run_with_apptainer_backend_registry:
+	 ${OB_CMD} -b Clustering_apptainer_registry.yml
+	 mv out out_apptainer_registry
 run_with_apptainer_backend_vanilla:
 	 ${OB_CMD} -b Clustering_apptainer_vanilla.yml
 	 mv out out_apptainer_vanilla
diff --git a/envs/README.md b/envs/README.md
index 4c68a1c..bb1f174 100644
--- a/envs/README.md
+++ b/envs/README.md
@@ -35,7 +35,11 @@ The "optimized" one does a custom python 3.12 compilation; the vanillapy stocks
 
 ## Aptainer semi-reproducible with registry pull
 
-No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://quay.io/omnibenchmark registry.
+No need to prepare/build anything, since it fetches the apptainer images from a remote registry:
+
+```bash
+ob run benchmark -b Clustering_apptainer_registry.yml --local
+```
 
 ## envmodules - reproducible builds with easybuild
 

From 765e4189449806859182976c09356f32fe856db1 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Fri, 16 May 2025 13:46:58 +0200
Subject: [PATCH 50/60] bump clustering-benchmarks to 1.1.6

---
 envs/clustbench.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/envs/clustbench.yml b/envs/clustbench.yml
index 6cb6201..f894c22 100644
--- a/envs/clustbench.yml
+++ b/envs/clustbench.yml
@@ -6,9 +6,7 @@ dependencies:
   - conda-forge::python=3.12.6
   - conda-forge::pip
   - pip:
-    #- "clustering-benchmarks==1.1.5"
-    - 'https://github.com/gagolews/clustering-benchmarks/releases/download/v1.1.5/clustering_benchmarks-1.1.5.tar.gz'
-    - "wget"
+    - "clustering-benchmarks==1.1.6"
     - "fastcluster==1.2.6"
     - "numpy==1.26.4"
     - "scipy==1.14.1"

From 365f2a0a9f00837b2dab14a0c1853719fed4aad4 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 19 May 2025 13:22:17 +0200
Subject: [PATCH 51/60] templatize

---
 Clustering.yaml                               | 233 ------------------
 Clustering_apptainer.yml                      | 211 ++++++++++++++++
 Clustering_apptainer_optimized.yml            |  37 +--
 ...> Clustering_apptainer_optimized_short.yml |  37 ++-
 Clustering_apptainer_short.yml                | 121 +++++++++
 Clustering_apptainer_vanilla.yml              |  43 ++--
 ... => Clustering_apptainer_vanilla_short.yml |  33 +--
 Clustering_conda.yml                          |  50 ++--
 ...moketest.yml => Clustering_conda_short.yml |  34 +--
 Clustering_envmodules.yml                     | 214 ++++------------
 Clustering_envmodules_short.yml               | 121 +++++++++
 Makefile                                      |  39 ++-
 README.md                                     |   5 +
 Clustering_apptainer_registry.yml => base.yml |  21 +-
 envs/build_singularity.sh                     |   2 +-
 overrides/apptainer.yml                       |   4 +
 overrides/apptainer_optimized.yml             |  12 +
 overrides/apptainer_vanilla.yml               |  12 +
 overrides/conda.yml                           |   4 +
 overrides/envmodules.yml                      |   4 +
 .../base.yml                                  |  19 +-
 21 files changed, 668 insertions(+), 588 deletions(-)
 delete mode 100644 Clustering.yaml
 create mode 100644 Clustering_apptainer.yml
 rename Clustering_apptainer_vanilla_smoketest.yml => Clustering_apptainer_optimized_short.yml (85%)
 create mode 100644 Clustering_apptainer_short.yml
 rename Clustering_envmodules_smoketest.yml => Clustering_apptainer_vanilla_short.yml (88%)
 rename Clustering_apptainer_registry_smoketest.yml => Clustering_conda_short.yml (89%)
 create mode 100644 Clustering_envmodules_short.yml
 rename Clustering_apptainer_registry.yml => base.yml (96%)
 create mode 100644 overrides/apptainer.yml
 create mode 100644 overrides/apptainer_optimized.yml
 create mode 100644 overrides/apptainer_vanilla.yml
 create mode 100644 overrides/conda.yml
 create mode 100644 overrides/envmodules.yml
 rename Clustering_conda_smoketest.yml => smoketest/base.yml (92%)

diff --git a/Clustering.yaml b/Clustering.yaml
deleted file mode 100644
index 778675e..0000000
--- a/Clustering.yaml
+++ /dev/null
@@ -1,233 +0,0 @@
-id: clustering_example
-description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
-version: 1.2
-benchmarker: "Izaskun Mallona, Daniel Incicau"
-benchmark_yaml_spec: 0.04
-# storage: https://play.min.io
-# storage_api: S3
-# storage_bucket_name: clustering_example
-software_backend: conda
-software_environments:
-  clustbench:
-    description: "clustbench on py3.12.6"
-    conda: envs/clustbench.yml
-    envmodule: clustbench
-    apptainer: envs/clustbench.sif
-  sklearn:
-    description: "Daniel's on py3.12.6"
-    conda: envs/sklearn.yml
-    apptainer: envs/sklearn.sif
-    envmodule: clustbench # not true, but
-  R:
-    description: "Daniel's R with readr, dplyr, mclust, caret"
-    conda: envs/r.yml
-    apptainer: envs/r.sif
-    envmodule: fcps # not true, but
-  rmarkdown:
-    description: "R with some plotting dependencies"
-    conda: envs/rmarkdown.yml
-    apptainer: envs/r.sif # not true, but
-    envmodule: fcps # not true, but
-  fcps:
-    description: "CRAN's FCPS"
-    conda: envs/fcps.yml
-    apptainer: envs/fcps.sif
-    envmodule: fcps
-metric_collectors:
-  - id: plotting
-    name: "Single-backend metric collector."
-    software_environment: "rmarkdown"
-    repository:
-      url: https://github.com/imallona/clustering_report
-      commit: 1d6bdf5
-    inputs:
-      - metrics.scores
-    outputs:
-      - id: plotting.html
-        path: "{input}/{name}/plotting_report.html"
-
-stages:
-  ## clustbench data ##########################################################
-
-  - id: data
-    modules:
-      - id: clustbench
-        name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
-        software_environment: "clustbench"
-        repository:
-          url: https://github.com/imallona/clustbench_data
-          commit: 31ac323
-
-        parameters: # comments depict the possible cardinalities and the number of curated labelsets
-          - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1
-          # - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] #  2 1
-          # - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2
-          # - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] #  7 1
-          # - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1
-          # - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6  2
-          # - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] #  4 1
-          # - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] #  2 1
-          # - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] #  2 1
-          # - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] #  2 1
-          # - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6
-          # - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1
-          # - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] #  2, 4  2
-          # - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1
-          # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1
-          # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] #  2, 5  2
-          # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5  2
-          # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5  2
-          # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] #  3, 5  2
-          # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1
-          # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1
-          # - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1
-          # - values: ["--dataset_generator", "other", "--dataset_name", "iris"] #  3 1
-          # - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1
-          # - values: ["--dataset_generator", "other", "--dataset_name", "square"] #  2 1
-          # - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] #  7 1
-          # - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5
-          # - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] #  2 2
-          # - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1
-          # - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] #  3, 4  2
-          # - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] #  8, 9, 15  3
-          # - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1
-          # - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] #  8 1
-          # - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1
-          # - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] #  2 1
-          # - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1
-          # - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1
-          # - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] #  2 1
-          # - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] #  3 1
-          # - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10  1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10  1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6  2
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] #  4 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] #  3 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] #  3 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] #  3 1
-          - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] #  4 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] #  3 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] #  5 1
-          # - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] #  4 1
-    outputs:
-      - id: data.matrix
-        path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
-      - id: data.true_labels
-        path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
-
-  ## clustbench methods (fastcluster) ###################################################################
-  
-  - id: clustering
-    modules:
-      - id: fastcluster
-        name: "fastcluster algorithm"
-        software_environment: "clustbench"
-        repository:
-          url: https://github.com/imallona/clustbench_fastcluster
-          # url: /home/imallona/src/clustbench_fastcluster/
-          commit: "45e43d3"
-        parameters:
-          - values: ["--linkage", "complete"]
-          - values: ["--linkage", "ward"]
-          # - values: ["--linkage", "average"]
-          # - values: ["--linkage", "weighted"]
-          # - values: ["--linkage", "median"]
-          # - values: ["--linkage", "centroid"]
-      - id: sklearn
-        name: "sklearn"
-        software_environment: "clustbench"
-        repository:
-          url: https://github.com/imallona/clustbench_sklearn
-          #url: /home/imallona/src/clustbench_sklearn
-          commit: 5877378
-        parameters:
-          - values: ["--method", "birch"]
-          - values: ["--method", "kmeans"]
-          # - values: ["--method", "spectral"] ## too slow
-          # - values: ["--method", "gm"]
-      - id: agglomerative
-        name: "agglomerative"
-        software_environment: "clustbench"
-        repository:
-          url: https://github.com/imallona/clustbench_agglomerative
-          commit: 5454368
-        parameters:
-          # - values: ["--linkage", "average"]
-          - values: ["--linkage", "complete"]
-          - values: ["--linkage", "ward"]
-      - id: genieclust
-        name: "genieclust"
-        software_environment: "clustbench"
-        repository:
-          url: https://github.com/imallona/clustbench_genieclust
-          commit: 6090043
-        parameters:
-          - values: ["--method", "genie", "--gini_threshold", 0.5]
-          - values: ["--method", "gic"]
-          # - values: ["--method", "ica"]
-      - id: fcps
-        name: "fcps"
-        software_environment: "fcps"
-        repository:
-          url: https://github.com/imallona/clustbench_fcps
-          commit: 272fa5f
-        parameters:
-          # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in conda
-          - values: ["--method", "FCPS_Minimax"]
-          - values: ["--method", "FCPS_MinEnergy"]
-          # - values: ["--method", "FCPS_HDBSCAN_2"]
-          # - values: ["--method", "FCPS_HDBSCAN_4"]
-          # - values: ["--method", "FCPS_HDBSCAN_8"]
-          # - values: ["--method", "FCPS_Diana"]
-          # - values: ["--method", "FCPS_Fanny"]
-          # - values: ["--method", "FCPS_Hardcl"]
-          # - values: ["--method", "FCPS_Softcl"]
-          # - values: ["--method", "FCPS_Clara"]
-          # - values: ["--method", "FCPS_PAM"]
-    inputs:
-      - entries:
-          - data.matrix
-          - data.true_labels
-    outputs:
-      - id: clustering.predicted_ks_range
-        path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
-
-  - id: metrics
-    modules:
-      - id: partition_metrics
-        name: "clustbench partition metrics"
-        software_environment: "clustbench"
-        repository:
-          url: https://github.com/imallona/clustbench_metrics
-          commit: 9132d45
-        parameters:
-          - values: ["--metric", "normalized_clustering_accuracy"]
-          - values: ["--metric", "adjusted_fm_score"]
-          # - values: ["--metric", "adjusted_mi_score"]
-          # - values: ["--metric", "adjusted_rand_score"]
-          # - values: ["--metric", "fm_score"]
-          # - values: ["--metric", "mi_score"]
-          # - values: ["--metric", "normalized_clustering_accuracy"]
-          # - values: ["--metric", "normalized_mi_score"]
-          # - values: ["--metric", "normalized_pivoted_accuracy"]
-          # - values: ["--metric", "pair_sets_index"]
-          # - values: ["--metric", "rand_score"]
-    inputs:
-      - entries:
-          - clustering.predicted_ks_range
-          - data.true_labels
-    outputs:
-      - id: metrics.scores
-        path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
diff --git a/Clustering_apptainer.yml b/Clustering_apptainer.yml
new file mode 100644
index 0000000..e075d81
--- /dev/null
+++ b/Clustering_apptainer.yml
@@ -0,0 +1,211 @@
+# this file has been generated automatically - DO NOT EDIT BY HAND
+version: 1.5
+benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
+benchmark_yaml_spec: 0.4
+software_environments:
+  clustbench:
+    description: "clustbench on py3.12.3, default python" # describes the clustbench-vanilla image referenced below
+    conda: envs/clustbench.yml
+    envmodule: clustbench/0.1.0-foss-2023b
+    apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0
+  fcps:
+    description: "CRAN's FCPS"
+    conda: envs/fcps.yml
+    envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
+    apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    conda: envs/rmarkdown.yml
+    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
+    apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 # same image as the fcps environment
+metric_collectors:
+  - id: plotting
+    name: "Single-backend metric collector."
+    software_environment: rmarkdown
+    repository:
+      url: https://github.com/imallona/clustering_report
+      commit: 1d6bdf5
+    inputs:
+      - metrics.scores
+    outputs:
+      - id: plotting.html
+        path: "{input}/{name}/plotting_report.html"
+stages:
+  - id: data
+    modules:
+      - id: clustbench
+        name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_data
+          commit: 31ac323
+        parameters: # comments depict the possible cardinalities and the number of curated labelsets
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] #	2	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] #	2	2
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] #	7	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] #	3	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] #	2, 6	2
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] #	4	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] #	2	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] #	2	1
+          - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] #	2	1
+          - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] #	2, 4, 5	6
+          - values: ["--dataset_generator", "graves", "--dataset_name", "line"] #	2	1
+          - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] #	2, 4	2
+          - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] #	2	1
+          - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] #	2	1
+          - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] #	2, 5	2
+          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] #	3, 5	2
+          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] #	3, 5	2
+          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] #	3, 5	2
+          - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] #	6	1
+          - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] #	6	1
+          - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] #	6	1
+          - values: ["--dataset_generator", "other", "--dataset_name", "iris"] #	3	1
+          - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] #	3	1
+          - values: ["--dataset_generator", "other", "--dataset_name", "square"] #	2	1
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] #	7	1
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] #	4, 5, 6	5
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] #	2	2
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] #	2	1
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] #	3, 4	2
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] #	8, 9, 15	3
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] #	3	1
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] #	8	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] #	8	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] #	2	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] #	2	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] #	7	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] #	2	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] #	3	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] #	10	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] #	4	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] #	4	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] #	10	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] #	6	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] #	2	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] #	5	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] #	4, 6	2
+          - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] #	2	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] #	4	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] #	2	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] #	5	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] #	4	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] #	5	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] #	4	1
+    outputs:
+      - id: data.matrix
+        path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
+      - id: data.true_labels
+        path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
+  - id: clustering
+    modules:
+      - id: fastcluster
+        name: "fastcluster algorithm"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_fastcluster
+          commit: "45e43d3"
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+          - values: ["--linkage", "average"]
+          - values: ["--linkage", "weighted"]
+          - values: ["--linkage", "median"]
+          - values: ["--linkage", "centroid"]
+      - id: sklearn
+        name: sklearn
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_sklearn
+          commit: 5877378
+        parameters:
+          - values: ["--method", "birch"]
+          - values: ["--method", "kmeans"]
+          # - values: ["--method", "spectral"] ## too slow
+          - values: ["--method", "gm"]
+      - id: agglomerative
+        name: "agglomerative"
+        software_environment: "clustbench"
+        repository:
+          url: https://github.com/imallona/clustbench_agglomerative
+          commit: 5454368
+        parameters:
+          - values: ["--linkage", "average"]
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: genieclust
+        name: genieclust
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_genieclust
+          commit: 6090043
+        parameters:
+          - values: ["--method", "genie", "--gini_threshold", 0.5]
+          - values: ["--method", "gic"]
+          - values: ["--method", "ica"]
+      - id: fcps
+        name: "fcps"
+        software_environment: "fcps"
+        repository:
+          url: https://github.com/imallona/clustbench_fcps
+          commit: 272fa5f
+        parameters:
+          # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in conda
+          - values: ["--method", "FCPS_Minimax"]
+          - values: ["--method", "FCPS_MinEnergy"]
+          - values: ["--method", "FCPS_HDBSCAN_2"]
+          - values: ["--method", "FCPS_HDBSCAN_4"]
+          - values: ["--method", "FCPS_HDBSCAN_8"]
+          - values: ["--method", "FCPS_Diana"]
+          - values: ["--method", "FCPS_Fanny"]
+          - values: ["--method", "FCPS_Hardcl"]
+          - values: ["--method", "FCPS_Softcl"]
+          - values: ["--method", "FCPS_Clara"]
+          - values: ["--method", "FCPS_PAM"]
+    inputs:
+      - entries:
+          - data.matrix
+          - data.true_labels
+    outputs:
+      - id: clustering.predicted_ks_range
+        path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
+  - id: metrics
+    modules:
+      - id: partition_metrics
+        name: "clustbench partition metrics"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_metrics
+          commit: 9132d45
+        parameters: # one entry per metric; keep the list free of duplicates
+          - values: ["--metric", "normalized_clustering_accuracy"]
+          - values: ["--metric", "adjusted_fm_score"]
+          - values: ["--metric", "adjusted_mi_score"]
+          - values: ["--metric", "adjusted_rand_score"]
+          - values: ["--metric", "fm_score"]
+          - values: ["--metric", "mi_score"]
+          # - values: ["--metric", "normalized_clustering_accuracy"] ## duplicate of the first entry
+          - values: ["--metric", "normalized_mi_score"]
+          - values: ["--metric", "normalized_pivoted_accuracy"]
+          - values: ["--metric", "pair_sets_index"]
+          - values: ["--metric", "rand_score"]
+    inputs:
+      - entries:
+          - clustering.predicted_ks_range
+          - data.true_labels
+    outputs:
+      - id: metrics.scores
+        path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
+id: clustering_benchmark_apptainer_oras
+description: Clustering benchmark on Gagolewski's. Using apptainer from omnibenchmark ORAS registry.
+software_backend: apptainer
diff --git a/Clustering_apptainer_optimized.yml b/Clustering_apptainer_optimized.yml
index 0a479e5..d536ddc 100644
--- a/Clustering_apptainer_optimized.yml
+++ b/Clustering_apptainer_optimized.yml
@@ -1,33 +1,23 @@
-id: clustering_example_apptainer_optimized
-description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
-
+# this file has been generated automatically - DO NOT EDIT BY HAND
 version: 1.5
 benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
 benchmark_yaml_spec: 0.4
-
-software_backend: apptainer
-
 software_environments:
-
   clustbench:
     description: "clustbench on py3.12.9, optimized python build"
-    conda: envs/clustbench.yml # not used
-    envmodule: na
+    conda: envs/clustbench.yml
+    envmodule: clustbench/0.1.0-foss-2023b
     apptainer: envs/clustbench-optimized.sif
-
   fcps:
     description: "CRAN's FCPS"
-    conda: envs/fcps.yml # not used
-    envmodule: na
+    conda: envs/fcps.yml
+    envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
     apptainer: envs/fcps.sif
-
   rmarkdown:
     description: "R with some plotting dependencies"
-    conda: envs/rmarkdown.yml # not used
-    envmodule: na
-    apptainer: envs/fcps.sif  # we reuse fcps env
-
-
+    conda: envs/rmarkdown.yml
+    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
+    apptainer: envs/fcps.sif
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
@@ -40,9 +30,7 @@ metric_collectors:
     outputs:
       - id: plotting.html
         path: "{input}/{name}/plotting_report.html"
-
 stages:
-
   - id: data
     modules:
       - id: clustbench
@@ -51,8 +39,7 @@ stages:
         repository:
           url: https://github.com/imallona/clustbench_data
           commit: 31ac323
-
-        parameters:  # comments depict the possible cardinalities and the number of curated labelsets
+        parameters: # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] #	2	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] #	2	2
@@ -120,7 +107,6 @@ stages:
         path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
       - id: data.true_labels
         path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
-
   - id: clustering
     modules:
       - id: fastcluster
@@ -136,7 +122,6 @@ stages:
           - values: ["--linkage", "weighted"]
           - values: ["--linkage", "median"]
           - values: ["--linkage", "centroid"]
-
       - id: sklearn
         name: sklearn
         software_environment: clustbench
@@ -194,7 +179,6 @@ stages:
     outputs:
       - id: clustering.predicted_ks_range
         path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
-
   - id: metrics
     modules:
       - id: partition_metrics
@@ -222,3 +206,6 @@ stages:
     outputs:
       - id: metrics.scores
         path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
+id: clustering_benchmark_apptainer_optimized_local
+description: Clustering benchmark on Gagolewski's. Using apptainer (locally built image, optimized python)
+software_backend: apptainer
diff --git a/Clustering_apptainer_vanilla_smoketest.yml b/Clustering_apptainer_optimized_short.yml
similarity index 85%
rename from Clustering_apptainer_vanilla_smoketest.yml
rename to Clustering_apptainer_optimized_short.yml
index 0a2139f..5bbd791 100644
--- a/Clustering_apptainer_vanilla_smoketest.yml
+++ b/Clustering_apptainer_optimized_short.yml
@@ -1,31 +1,23 @@
-id: clustering_example_envmodules
-description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
-version: 1.4
-benchmarker: "Izaskun Mallona, Daniel Incicau"
+# this file has been generated automatically - DO NOT EDIT BY HAND
+version: 1.5
+benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
 benchmark_yaml_spec: 0.5
-
-software_backend: apptainer
-
 software_environments:
-
   clustbench:
-    description: "clustbench on py3.12.3, default python"
-    envmodule: na
-    conda: envs/clustbench.yml # not used
-    apptainer: envs/clustbench-vanilla.sif
-
+    description: "clustbench on py3.12.9, optimized python build"
+    envmodule: clustbench/0.1.0-foss-2023b
+    conda: envs/clustbench.yml
+    apptainer: envs/clustbench-optimized.sif
   fcps:
     description: "CRAN's FCPS"
-    envmodule: na
-    conda: envs/fcps.yml # not used
+    envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
+    conda: envs/fcps.yml
     apptainer: envs/fcps.sif
-
   rmarkdown:
     description: "R with some plotting dependencies"
-    envmodule: na
-    conda: envs/rmarkdown.yml # not used
-    apptainer: envs/rmarkdown.sif
-
+    conda: envs/rmarkdown.yml
+    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
+    apptainer: envs/fcps.sif
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
@@ -38,7 +30,6 @@ metric_collectors:
     outputs:
       - id: plotting.html
         path: "{input}/{name}/plotting_report.html"
-
 stages:
   - id: data
     modules:
@@ -48,7 +39,6 @@ stages:
         repository:
           url: https://github.com/imallona/clustbench_data
           commit: 31ac323
-
         parameters: # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
     outputs:
@@ -56,7 +46,6 @@ stages:
         path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
       - id: data.true_labels
         path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
-
   - id: clustering
     modules:
       - id: fastcluster
@@ -109,7 +98,6 @@ stages:
     outputs:
       - id: clustering.predicted_ks_range
         path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
-
   - id: metrics
     modules:
       - id: partition_metrics
@@ -128,3 +116,6 @@ stages:
     outputs:
       - id: metrics.scores
         path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
+id: clustering_benchmark_apptainer_optimized_local
+description: Clustering benchmark on Gagolewski's. Using apptainer (locally built image, optimized python)
+software_backend: apptainer
diff --git a/Clustering_apptainer_short.yml b/Clustering_apptainer_short.yml
new file mode 100644
index 0000000..71bdd6f
--- /dev/null
+++ b/Clustering_apptainer_short.yml
@@ -0,0 +1,121 @@
+# this file has been generated automatically - DO NOT EDIT BY HAND
+version: 1.5
+benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
+benchmark_yaml_spec: 0.5
+software_environments: # each environment lists its conda, envmodule and apptainer variants
+  clustbench:
+    description: "clustbench on py3.12.3, default python"
+    envmodule: clustbench/0.1.0-foss-2023b
+    conda: envs/clustbench.yml
+    apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0
+  fcps:
+    description: "CRAN's FCPS"
+    envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
+    conda: envs/fcps.yml
+    apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    conda: envs/rmarkdown.yml
+    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
+    apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 # same image as the fcps environment
+metric_collectors:
+  - id: plotting
+    name: "Single-backend metric collector."
+    software_environment: rmarkdown
+    repository:
+      url: https://github.com/imallona/clustering_report
+      commit: 1d6bdf5
+    inputs:
+      - metrics.scores
+    outputs:
+      - id: plotting.html
+        path: "{input}/{name}/plotting_report.html"
+stages:
+  - id: data
+    modules:
+      - id: clustbench
+        name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_data
+          commit: 31ac323
+        parameters: # comments depict the possible cardinalities and the number of curated labelsets
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
+    outputs:
+      - id: data.matrix
+        path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
+      - id: data.true_labels
+        path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
+  - id: clustering
+    modules:
+      - id: fastcluster
+        name: "fastcluster algorithm"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_fastcluster
+          commit: "45e43d3"
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: sklearn
+        name: "sklearn"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_sklearn
+          commit: 5877378
+        parameters:
+          - values: ["--method", "birch"]
+          - values: ["--method", "kmeans"]
+      - id: agglomerative
+        name: "agglomerative"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_agglomerative
+          commit: 5454368
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: genieclust
+        name: "genieclust"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_genieclust
+          commit: 6090043
+        parameters:
+          - values: ["--method", "genie", "--gini_threshold", 0.5]
+      - id: fcps
+        name: "fcps"
+        software_environment: fcps
+        repository:
+          url: https://github.com/imallona/clustbench_fcps
+          commit: 272fa5f
+        parameters:
+          - values: ["--method", "FCPS_Minimax"]
+    inputs:
+      - entries:
+          - data.matrix
+          - data.true_labels
+    outputs:
+      - id: clustering.predicted_ks_range
+        path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
+  - id: metrics
+    modules:
+      - id: partition_metrics
+        name: "clustbench partition metrics"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_metrics
+          commit: 8184cd4 # NOTE(review): Clustering_apptainer.yml pins 9132d45 for this module - confirm the difference is intended
+        parameters: # two metrics only in this short variant
+          - values: ["--metric", "normalized_clustering_accuracy"]
+          - values: ["--metric", "adjusted_fm_score"]
+    inputs:
+      - entries:
+          - clustering.predicted_ks_range
+          - data.true_labels
+    outputs:
+      - id: metrics.scores
+        path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
+id: clustering_benchmark_apptainer_oras
+description: Clustering benchmark on Gagolewski's. Using apptainer from omnibenchmark ORAS registry.
+software_backend: apptainer
diff --git a/Clustering_apptainer_vanilla.yml b/Clustering_apptainer_vanilla.yml
index f11b80a..cd4ba56 100644
--- a/Clustering_apptainer_vanilla.yml
+++ b/Clustering_apptainer_vanilla.yml
@@ -1,33 +1,23 @@
-id: clustering_example_apptainer_vanilla
-
-description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
+# this file has been generated automatically - DO NOT EDIT BY HAND
 version: 1.5
 benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
 benchmark_yaml_spec: 0.4
-
-software_backend: apptainer
-
 software_environments:
-
   clustbench:
-    description: "clustbench on py3.12.3, default python"
-    envmodule: na
-    conda: envs/clustbench.yml # not used
-    apptainer: envs/clustbench-vanilla.sif
-
+    description: "clustbench on py3.12.3, default python"
+    conda: envs/clustbench.yml
+    envmodule: clustbench/0.1.0-foss-2023b
+    apptainer: envs/clustbench.sif
   fcps:
     description: "CRAN's FCPS"
-    conda: envs/fcps.yml # not used
-    envmodule: na
+    conda: envs/fcps.yml
+    envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
     apptainer: envs/fcps.sif
-
   rmarkdown:
     description: "R with some plotting dependencies"
-    conda: envs/rmarkdown.yml # not used
-    envmodule: na
-    apptainer: envs/fcps.sif  # we reuse fcps env
-
-
+    conda: envs/rmarkdown.yml
+    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
+    apptainer: envs/fcps.sif
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
@@ -38,11 +28,9 @@ metric_collectors:
     inputs:
       - metrics.scores
     outputs:
-       id: plotting.html
+      - id: plotting.html
         path: "{input}/{name}/plotting_report.html"
-
 stages:
-
   - id: data
     modules:
       - id: clustbench
@@ -51,8 +39,7 @@ stages:
         repository:
           url: https://github.com/imallona/clustbench_data
           commit: 31ac323
-
-        parameters:  # comments depict the possible cardinalities and the number of curated labelsets
+        parameters: # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] #	2	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] #	2	2
@@ -120,7 +107,6 @@ stages:
         path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
       - id: data.true_labels
         path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
-
   - id: clustering
     modules:
       - id: fastcluster
@@ -136,7 +122,6 @@ stages:
           - values: ["--linkage", "weighted"]
           - values: ["--linkage", "median"]
           - values: ["--linkage", "centroid"]
-
       - id: sklearn
         name: sklearn
         software_environment: clustbench
@@ -194,7 +179,6 @@ stages:
     outputs:
       - id: clustering.predicted_ks_range
         path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
-
   - id: metrics
     modules:
       - id: partition_metrics
@@ -222,3 +206,6 @@ stages:
     outputs:
       - id: metrics.scores
         path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
+id: clustering_benchmark_apptainer_vanilla_local
+description: Clustering benchmark on Gagolewski's. Using apptainer (locally built image)
+software_backend: apptainer
diff --git a/Clustering_envmodules_smoketest.yml b/Clustering_apptainer_vanilla_short.yml
similarity index 88%
rename from Clustering_envmodules_smoketest.yml
rename to Clustering_apptainer_vanilla_short.yml
index 27570bc..01a1fe2 100644
--- a/Clustering_envmodules_smoketest.yml
+++ b/Clustering_apptainer_vanilla_short.yml
@@ -1,31 +1,23 @@
-id: clustering_example_envmodules
-description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
-version: 1.4
-benchmarker: "Izaskun Mallona, Daniel Incicau"
+# this file has been generated automatically - DO NOT EDIT BY HAND
+version: 1.5
+benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
 benchmark_yaml_spec: 0.5
-
-software_backend: envmodules
-
 software_environments:
-
   clustbench:
-    description: "clustbench on py3.12.6"
+    description: "clustbench on py3.12.3, default python"
     envmodule: clustbench/0.1.0-foss-2023b
     conda: envs/clustbench.yml
-    apptainer: na
-
+    apptainer: envs/clustbench.sif
   fcps:
     description: "CRAN's FCPS"
     envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
     conda: envs/fcps.yml
-    apptainer: na
-
+    apptainer: envs/fcps.sif
   rmarkdown:
     description: "R with some plotting dependencies"
+    conda: envs/rmarkdown.yml
     envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
-    conda: envs/clustbench.yml
-    apptainer: na
-
+    apptainer: envs/fcps.sif
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
@@ -38,7 +30,6 @@ metric_collectors:
     outputs:
       - id: plotting.html
         path: "{input}/{name}/plotting_report.html"
-
 stages:
   - id: data
     modules:
@@ -48,7 +39,6 @@ stages:
         repository:
           url: https://github.com/imallona/clustbench_data
           commit: 31ac323
-
         parameters: # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
     outputs:
@@ -56,9 +46,6 @@ stages:
         path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
       - id: data.true_labels
         path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
-
-  ## clustbench methods (fastcluster) ###################################################################
-
   - id: clustering
     modules:
       - id: fastcluster
@@ -111,7 +98,6 @@ stages:
     outputs:
       - id: clustering.predicted_ks_range
         path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
-
   - id: metrics
     modules:
       - id: partition_metrics
@@ -130,3 +116,6 @@ stages:
     outputs:
       - id: metrics.scores
         path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
+id: clustering_benchmark_apptainer_vanilla_local
+description: Clustering benchmark on Gagolewski's. Using apptainer (locally built image)
+software_backend: apptainer
diff --git a/Clustering_conda.yml b/Clustering_conda.yml
index 17e48f0..5fd45d2 100644
--- a/Clustering_conda.yml
+++ b/Clustering_conda.yml
@@ -1,32 +1,23 @@
-id: clustering_example_conda
-
-description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
+# this file has been generated automatically - DO NOT EDIT BY HAND
 version: 1.5
 benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
 benchmark_yaml_spec: 0.4
-
-software_backend: conda
-
 software_environments:
-
   clustbench:
-    description: "clustbench on py3.12.6"
+    description: "clustbench on py3.12.9, optimized python build"
     conda: envs/clustbench.yml
-    envmodule: clustbench
-    apptainer: na
-
+    envmodule: clustbench/0.1.0-foss-2023b
+    apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0
   fcps:
     description: "CRAN's FCPS"
     conda: envs/fcps.yml
-    envmodule: fcps
-    apptainer: na
-
+    envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
+    apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
   rmarkdown:
     description: "R with some plotting dependencies"
     conda: envs/rmarkdown.yml
-    envmodule: fcps # not used
-    apptainer: na
-
+    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
+    apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
@@ -39,10 +30,7 @@ metric_collectors:
     outputs:
       - id: plotting.html
         path: "{input}/{name}/plotting_report.html"
-
 stages:
-  ## clustbench data ##########################################################
-
   - id: data
     modules:
       - id: clustbench
@@ -51,8 +39,7 @@ stages:
         repository:
           url: https://github.com/imallona/clustbench_data
           commit: 31ac323
-
-        parameters:  # comments depict the possible cardinalities and the number of curated labelsets
+        parameters: # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] #	2	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] #	2	2
@@ -120,17 +107,13 @@ stages:
         path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
       - id: data.true_labels
         path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
-
-  ## clustbench methods (fastcluster) ###################################################################
-  
   - id: clustering
     modules:
       - id: fastcluster
         name: "fastcluster algorithm"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_fastcluster
-          # url: /home/imallona/src/clustbench_fastcluster/
           commit: "45e43d3"
         parameters:
           - values: ["--linkage", "complete"]
@@ -140,11 +123,10 @@ stages:
           - values: ["--linkage", "median"]
           - values: ["--linkage", "centroid"]
       - id: sklearn
-        name: "sklearn"
+        name: sklearn
         software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_sklearn
-          #url: /home/imallona/src/clustbench_sklearn
           commit: 5877378
         parameters:
           - values: ["--method", "birch"]
@@ -162,8 +144,8 @@ stages:
           - values: ["--linkage", "complete"]
           - values: ["--linkage", "ward"]
       - id: genieclust
-        name: "genieclust"
-        software_environment: "clustbench"
+        name: genieclust
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_genieclust
           commit: 6090043
@@ -197,12 +179,11 @@ stages:
     outputs:
       - id: clustering.predicted_ks_range
         path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
-
   - id: metrics
     modules:
       - id: partition_metrics
         name: "clustbench partition metrics"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_metrics
           commit: 9132d45
@@ -225,3 +206,6 @@ stages:
     outputs:
       - id: metrics.scores
         path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
+id: clustering_benchmark_conda
+description: Clustering benchmark on Gagolewski's. Using conda.
+software_backend: conda
diff --git a/Clustering_apptainer_registry_smoketest.yml b/Clustering_conda_short.yml
similarity index 89%
rename from Clustering_apptainer_registry_smoketest.yml
rename to Clustering_conda_short.yml
index 7aae229..fd9ae01 100644
--- a/Clustering_apptainer_registry_smoketest.yml
+++ b/Clustering_conda_short.yml
@@ -1,34 +1,23 @@
-id: clustering_example_oras
-
-description: Clustering benchmark on Gagolewski's. Using ORAS registry.
+# this file has been generated automatically - DO NOT EDIT BY HAND
 version: 1.5
-
 benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
-benchmark_yaml_spec: 0.4
-
-software_backend: apptainer
-
+benchmark_yaml_spec: 0.5
 software_environments:
-
   clustbench:
     description: "clustbench on py3.12.3, default python"
-    envmodule: na
-    conda: envs/clustbench.yml # not used
+    envmodule: clustbench/0.1.0-foss-2023b
+    conda: envs/clustbench.yml
     apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0
-
   fcps:
     description: "CRAN's FCPS"
-    conda: envs/fcps.yml # not used
-    envmodule: na
+    envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
+    conda: envs/fcps.yml
     apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
-
   rmarkdown:
     description: "R with some plotting dependencies"
-    conda: envs/rmarkdown.yml # not used
-    envmodule: na
+    conda: envs/rmarkdown.yml
+    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
     apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
-
-
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
@@ -41,7 +30,6 @@ metric_collectors:
     outputs:
       - id: plotting.html
         path: "{input}/{name}/plotting_report.html"
-
 stages:
   - id: data
     modules:
@@ -51,7 +39,6 @@ stages:
         repository:
           url: https://github.com/imallona/clustbench_data
           commit: 31ac323
-
         parameters: # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
     outputs:
@@ -59,7 +46,6 @@ stages:
         path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
       - id: data.true_labels
         path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
-
   - id: clustering
     modules:
       - id: fastcluster
@@ -112,7 +98,6 @@ stages:
     outputs:
       - id: clustering.predicted_ks_range
         path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
-
   - id: metrics
     modules:
       - id: partition_metrics
@@ -131,3 +116,6 @@ stages:
     outputs:
       - id: metrics.scores
         path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
+id: clustering_benchmark_conda
+description: Clustering benchmark on Gagolewski's. Using conda.
+software_backend: conda
diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml
index 43c24fa..805e130 100644
--- a/Clustering_envmodules.yml
+++ b/Clustering_envmodules.yml
@@ -1,33 +1,23 @@
-id: clustering_example_envmodules
-
-description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
+# this file has been generated automatically - DO NOT EDIT BY HAND
 version: 1.5
 benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
 benchmark_yaml_spec: 0.4
-
-software_backend: envmodules
-
 software_environments:
-
   clustbench:
-    description: "clustbench on py3.12.6"
-    conda: envs/clustbench.yml # not used
+    description: "clustbench on py3.12.9, optimized python build"
+    conda: envs/clustbench.yml
     envmodule: clustbench/0.1.0-foss-2023b
-    apptainer: na
-
-  rmarkdown:
-    description: "R with some plotting dependencies"
-    conda: envs/rmakrkdown.yml # not used
-    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
-    apptainer: na
-
+    apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0
   fcps:
     description: "CRAN's FCPS"
-    conda: envs/fcps.yml # not used
+    conda: envs/fcps.yml
     envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
-    apptainer: na
-
-
+    apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    conda: envs/rmarkdown.yml
+    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
+    apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
@@ -40,9 +30,7 @@ metric_collectors:
     outputs:
       - id: plotting.html
         path: "{input}/{name}/plotting_report.html"
-
 stages:
-
   - id: data
     modules:
       - id: clustbench
@@ -51,133 +39,42 @@ stages:
         repository:
           url: https://github.com/imallona/clustbench_data
           commit: 31ac323
-
         parameters: # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
-          - values: [
-                "--dataset_generator",
-                "fcps",
-                "--dataset_name",
-                "chainlink",
-              ] #	2	1
-          - values: [
-                "--dataset_generator",
-                "fcps",
-                "--dataset_name",
-                "engytime",
-              ] #	2	2
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] #	2	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] #	2	2
           - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] #	7	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] #	3	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] #	2, 6	2
           - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] #	4	1
-          - values: [
-                "--dataset_generator",
-                "fcps",
-                "--dataset_name",
-                "twodiamonds",
-              ] #	2	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] #	2	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] #	2	1
           - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] #	2	1
-          - values: [
-                "--dataset_generator",
-                "graves",
-                "--dataset_name",
-                "fuzzyx",
-              ] #	2, 4, 5	6
+          - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] #	2, 4, 5	6
           - values: ["--dataset_generator", "graves", "--dataset_name", "line"] #	2	1
-          - values: [
-                "--dataset_generator",
-                "graves",
-                "--dataset_name",
-                "parabolic",
-              ] #	2, 4	2
+          - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] #	2, 4	2
           - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] #	2	1
-          - values: [
-                "--dataset_generator",
-                "graves",
-                "--dataset_name",
-                "ring_noisy",
-              ] #	2	1
-          - values: [
-                "--dataset_generator",
-                "graves",
-                "--dataset_name",
-                "ring_outliers",
-              ] #	2, 5	2
-          - values: [
-                "--dataset_generator",
-                "graves",
-                "--dataset_name",
-                "zigzag",
-              ] #	3, 5	2
-          - values: [
-                "--dataset_generator",
-                "graves",
-                "--dataset_name",
-                "zigzag_noisy",
-              ] #	3, 5	2
-          - values: [
-                "--dataset_generator",
-                "graves",
-                "--dataset_name",
-                "zigzag_outliers",
-              ] #	3, 5	2
-          - values: [
-                "--dataset_generator",
-                "other",
-                "--dataset_name",
-                "chameleon_t4_8k",
-              ] #	6	1
-          - values: [
-                "--dataset_generator",
-                "other",
-                "--dataset_name",
-                "chameleon_t5_8k",
-              ] #	6	1
-          - values: [
-                "--dataset_generator",
-                "other",
-                "--dataset_name",
-                "hdbscan",
-              ] #	6	1
+          - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] #	2	1
+          - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] #	2, 5	2
+          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] #	3, 5	2
+          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] #	3, 5	2
+          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] #	3, 5	2
+          - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] #	6	1
+          - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] #	6	1
+          - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] #	6	1
           - values: ["--dataset_generator", "other", "--dataset_name", "iris"] #	3	1
           - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] #	3	1
           - values: ["--dataset_generator", "other", "--dataset_name", "square"] #	2	1
-          - values: [
-                "--dataset_generator",
-                "sipu",
-                "--dataset_name",
-                "aggregation",
-              ] #	7	1
-          - values: [
-                "--dataset_generator",
-                "sipu",
-                "--dataset_name",
-                "compound",
-              ] #	4, 5, 6	5
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] #	7	1
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] #	4, 5, 6	5
           - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] #	2	2
           - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] #	2	1
-          - values: [
-                "--dataset_generator",
-                "sipu",
-                "--dataset_name",
-                "pathbased",
-              ] #	3, 4	2
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] #	3, 4	2
           - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] #	8, 9, 15	3
           - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] #	3	1
-          - values: [
-                "--dataset_generator",
-                "sipu",
-                "--dataset_name",
-                "unbalance",
-              ] #	8	1
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] #	8	1
           - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] #	8	1
-          - values: [
-                "--dataset_generator",
-                "uci",
-                "--dataset_name",
-                "ionosphere",
-              ] #	2	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] #	2	1
           - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] #	2	1
           - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] #	7	1
           - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] #	2	1
@@ -186,18 +83,8 @@ stages:
           - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] #	4	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] #	4	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] #	10	1
-          - values: [
-                "--dataset_generator",
-                "wut",
-                "--dataset_name",
-                "isolation",
-              ] #	3	1
-          - values: [
-                "--dataset_generator",
-                "wut",
-                "--dataset_name",
-                "labirynth",
-              ] #	6	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] #	6	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] #	3	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] #	2	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] #	3	1
@@ -205,24 +92,9 @@ stages:
           - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] #	5	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] #	4, 6	2
           - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] #	2	1
-          - values: [
-                "--dataset_generator",
-                "wut",
-                "--dataset_name",
-                "trajectories",
-              ] #	4	1
-          - values: [
-                "--dataset_generator",
-                "wut",
-                "--dataset_name",
-                "trapped_lovers",
-              ] #	3	1
-          - values: [
-                "--dataset_generator",
-                "wut",
-                "--dataset_name",
-                "twosplashes",
-              ] #	2	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] #	4	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] #	2	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] #	5	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] #	3	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] #	3	1
@@ -235,9 +107,6 @@ stages:
         path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
       - id: data.true_labels
         path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
-
-  ## clustbench methods (fastcluster) ###################################################################
-
   - id: clustering
     modules:
       - id: fastcluster
@@ -245,7 +114,6 @@ stages:
         software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_fastcluster
-          # url: /home/imallona/src/clustbench_fastcluster/
           commit: "45e43d3"
         parameters:
           - values: ["--linkage", "complete"]
@@ -255,7 +123,7 @@ stages:
           - values: ["--linkage", "median"]
           - values: ["--linkage", "centroid"]
       - id: sklearn
-        name: "sklearn"
+        name: sklearn
         software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_sklearn
@@ -267,7 +135,7 @@ stages:
           - values: ["--method", "gm"]
       - id: agglomerative
         name: "agglomerative"
-        software_environment: clustbench
+        software_environment: "clustbench"
         repository:
           url: https://github.com/imallona/clustbench_agglomerative
           commit: 5454368
@@ -276,7 +144,7 @@ stages:
           - values: ["--linkage", "complete"]
           - values: ["--linkage", "ward"]
       - id: genieclust
-        name: "genieclust"
+        name: genieclust
         software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_genieclust
@@ -287,7 +155,7 @@ stages:
           - values: ["--method", "ica"]
       - id: fcps
         name: "fcps"
-        software_environment: fcps
+        software_environment: "fcps"
         repository:
           url: https://github.com/imallona/clustbench_fcps
           commit: 272fa5f
@@ -311,7 +179,6 @@ stages:
     outputs:
       - id: clustering.predicted_ks_range
         path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
-
   - id: metrics
     modules:
       - id: partition_metrics
@@ -319,7 +186,7 @@ stages:
         software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_metrics
-          commit: 8184cd4
+          commit: 9132d45
         parameters:
           - values: ["--metric", "normalized_clustering_accuracy"]
           - values: ["--metric", "adjusted_fm_score"]
@@ -339,3 +206,6 @@ stages:
     outputs:
       - id: metrics.scores
         path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
+id: clustering_benchmark_envmodules
+description: Clustering benchmark on Gagolewski's. Using envmodules.
+software_backend: envmodules
diff --git a/Clustering_envmodules_short.yml b/Clustering_envmodules_short.yml
new file mode 100644
index 0000000..e3dc0fd
--- /dev/null
+++ b/Clustering_envmodules_short.yml
@@ -0,0 +1,121 @@
+# this file has been generated automatically - DO NOT EDIT BY HAND
+version: 1.5
+benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
+benchmark_yaml_spec: 0.5
+software_environments:
+  clustbench:
+    description: "clustbench on py3.12.3, default python"
+    envmodule: clustbench/0.1.0-foss-2023b
+    conda: envs/clustbench.yml
+    apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0
+  fcps:
+    description: "CRAN's FCPS"
+    envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
+    conda: envs/fcps.yml
+    apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    conda: envs/rmarkdown.yml
+    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
+    apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
+metric_collectors:
+  - id: plotting
+    name: "Single-backend metric collector."
+    software_environment: rmarkdown
+    repository:
+      url: https://github.com/imallona/clustering_report
+      commit: 1d6bdf5
+    inputs:
+      - metrics.scores
+    outputs:
+      - id: plotting.html
+        path: "{input}/{name}/plotting_report.html"
+stages:
+  - id: data
+    modules:
+      - id: clustbench
+        name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_data
+          commit: 31ac323
+        parameters: # comments depict the possible cardinalities and the number of curated labelsets
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
+    outputs:
+      - id: data.matrix
+        path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
+      - id: data.true_labels
+        path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
+  - id: clustering
+    modules:
+      - id: fastcluster
+        name: "fastcluster algorithm"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_fastcluster
+          commit: "45e43d3"
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: sklearn
+        name: "sklearn"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_sklearn
+          commit: 5877378
+        parameters:
+          - values: ["--method", "birch"]
+          - values: ["--method", "kmeans"]
+      - id: agglomerative
+        name: "agglomerative"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_agglomerative
+          commit: 5454368
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: genieclust
+        name: "genieclust"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_genieclust
+          commit: 6090043
+        parameters:
+          - values: ["--method", "genie", "--gini_threshold", 0.5]
+      - id: fcps
+        name: "fcps"
+        software_environment: fcps
+        repository:
+          url: https://github.com/imallona/clustbench_fcps
+          commit: 272fa5f
+        parameters:
+          - values: ["--method", "FCPS_Minimax"]
+    inputs:
+      - entries:
+          - data.matrix
+          - data.true_labels
+    outputs:
+      - id: clustering.predicted_ks_range
+        path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
+  - id: metrics
+    modules:
+      - id: partition_metrics
+        name: "clustbench partition metrics"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_metrics
+          commit: 8184cd4
+        parameters:
+          - values: ["--metric", "normalized_clustering_accuracy"]
+          - values: ["--metric", "adjusted_fm_score"]
+    inputs:
+      - entries:
+          - clustering.predicted_ks_range
+          - data.true_labels
+    outputs:
+      - id: metrics.scores
+        path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
+id: clustering_benchmark_envmodules
+description: Clustering benchmark on Gagolewski's. Using envmodules.
+software_backend: envmodules
diff --git a/Makefile b/Makefile
index 27029dd..16b144a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,9 +1,40 @@
 MAX_CORES ?= 10
 TIMEOUT ?= 4h
+YQ_MERGE=yq eval-all 'select(fileIndex==1) * select(fileIndex==0)'
 
 # by default, we want to run all snakemake rules even if there are failures (-k)
 OB_CMD=ob run benchmark -k --local --task-timeout ${TIMEOUT} --cores ${MAX_CORES}
 
+APPTR = apptainer
+APPTV = apptainer_vanilla
+APPTO = apptainer_optimized
+CONDA = conda
+ENVMD = envmodules
+
+BASE       = base.yml
+BASE_SHORT = smoketest/base.yml
+
+# Install yq, the YAML processor used by the generate target (requires a Go toolchain on the system)
+deps:
+	go install github.com/mikefarah/yq/v4@latest
+
+# Generate every Clustering_*.yml: merge each overrides/<backend>.yml into ${BASE} (full) and ${BASE_SHORT} (_short); override keys take precedence
+generate:
+	${YQ_MERGE} overrides/${APPTR}.yml ${BASE} > Clustering_${APPTR}.yml
+	${YQ_MERGE} overrides/${APPTV}.yml ${BASE} > Clustering_${APPTV}.yml
+	${YQ_MERGE} overrides/${APPTO}.yml ${BASE} > Clustering_${APPTO}.yml
+	${YQ_MERGE} overrides/${CONDA}.yml ${BASE} > Clustering_${CONDA}.yml
+	${YQ_MERGE} overrides/${ENVMD}.yml ${BASE} > Clustering_${ENVMD}.yml
+	${YQ_MERGE} overrides/${APPTR}.yml ${BASE_SHORT} > Clustering_${APPTR}_short.yml
+	${YQ_MERGE} overrides/${APPTV}.yml ${BASE_SHORT} > Clustering_${APPTV}_short.yml
+	${YQ_MERGE} overrides/${APPTO}.yml ${BASE_SHORT} > Clustering_${APPTO}_short.yml
+	${YQ_MERGE} overrides/${CONDA}.yml ${BASE_SHORT} > Clustering_${CONDA}_short.yml
+	${YQ_MERGE} overrides/${ENVMD}.yml ${BASE_SHORT} > Clustering_${ENVMD}_short.yml
+
+.PHONY: deps generate clean
+clean: # remove the generated benchmark definitions; -f keeps this a no-op when none exist
+	rm -f Clustering_*.yml
+
 prepare_apptainer_env:
 	cd envs && ./build_singularity.sh
 prepare_envmodules_env:
@@ -13,15 +44,15 @@ prepare_envmodules_env:
 
 # short versions, to debug runs & environments
 run_with_apptainer_backend_registry_short:
-	 ${OB_CMD} -b Clustering_apptainer_registry_smoketest.yml
+	 ${OB_CMD} -b Clustering_apptainer_short.yml
 run_with_apptainer_backend_short:
-	 ${OB_CMD} -b Clustering_apptainer_vanilla_smoketest.yml
+	 ${OB_CMD} -b Clustering_apptainer_vanilla_short.yml
 	 mv out out_apptainer_short
 run_with_conda_backend_short:
-	 ${OB_CMD} -b Clustering_conda_smoketest.yml
+	 ${OB_CMD} -b Clustering_conda_short.yml
 	 mv out out_conda_short
 run_with_envmodules_backend_short:
-	 ${OB_CMD} -b Clustering_envmodules_smoketest.yml
+	 ${OB_CMD} -b Clustering_envmodules_short.yml
 	 mv out out_lmod_short
 
 # full versions (expect hours)
diff --git a/README.md b/README.md
index 89d7c05..ad7de1b 100644
--- a/README.md
+++ b/README.md
@@ -90,6 +90,11 @@ A yaml such as [0a88c91](https://github.com/omnibenchmark/clustering_example/blo
 
 In `envs`: conda, apptainer, easybuild (lmod modules)
 
+# Omnibenchmark YAML generation
+
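+The `Clustering_*.yml` benchmark definitions are generated by merging a base template (`base.yml`, or `smoketest/base.yml` for the `_short` variants) with a backend-specific file from `overrides/`; do not edit them by hand.
+To change them, edit the base template or the relevant override, install [yq](https://github.com/mikefarah/yq) (for example with `make deps`), and run `make generate`.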
+
 # Warnings
 
 Mind we try to run clusterings specifying the true number of clusters +- 2. But sometimes the true number is k=3. Then we do `k=2, k=2, k=3, k=5, k=6` filling with k=2s as needed, and recomputing the same values multiple times (so runtimes are comparable across datasets, regardless of their true number of clusters).
diff --git a/Clustering_apptainer_registry.yml b/base.yml
similarity index 96%
rename from Clustering_apptainer_registry.yml
rename to base.yml
index 7e090e3..bfadca6 100644
--- a/Clustering_apptainer_registry.yml
+++ b/base.yml
@@ -1,31 +1,25 @@
-id: clustering_example_oras
-
-description: Clustering benchmark on Gagolewski's. Using ORAS registry.
 version: 1.5
-
 benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
 benchmark_yaml_spec: 0.4
 
-software_backend: apptainer
-
 software_environments:
 
   clustbench:
-    description: "clustbench on py3.12.3, default python"
-    envmodule: na
-    conda: envs/clustbench.yml # not used
+    description: "clustbench on py3.12.9, optimized python build"
+    conda: envs/clustbench.yml
+    envmodule: clustbench/0.1.0-foss-2023b
     apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0
 
   fcps:
     description: "CRAN's FCPS"
-    conda: envs/fcps.yml # not used
-    envmodule: na
+    conda: envs/fcps.yml
+    envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
     apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
 
   rmarkdown:
     description: "R with some plotting dependencies"
-    conda: envs/rmarkdown.yml # not used
-    envmodule: na
+    conda: envs/rmarkdown.yml
+    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
     apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
 
 
@@ -223,3 +217,4 @@ stages:
     outputs:
       - id: metrics.scores
         path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
+
diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
index 430ed30..099c4c1 100755
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -7,5 +7,5 @@ CMD=singularity
 BUILD='build --fakeroot'
 # enable this if you want to compare with the custom python compilation
 # $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def
-$CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def
+$CMD ${BUILD} clustbench.sif clustbench_apptainer_vanillapy.def
 $CMD ${BUILD} fcps.sif fcps.def
diff --git a/overrides/apptainer.yml b/overrides/apptainer.yml
new file mode 100644
index 0000000..93b6c3e
--- /dev/null
+++ b/overrides/apptainer.yml
@@ -0,0 +1,4 @@
+# this file has been generated automatically - DO NOT EDIT BY HAND
+id: clustering_benchmark_apptainer_oras
+description: Clustering benchmark on Gagolewski's. Using apptainer from omnibenchmark ORAS registry.
+software_backend: apptainer
diff --git a/overrides/apptainer_optimized.yml b/overrides/apptainer_optimized.yml
new file mode 100644
index 0000000..ae4b5ad
--- /dev/null
+++ b/overrides/apptainer_optimized.yml
@@ -0,0 +1,12 @@
+# this file has been generated automatically - DO NOT EDIT BY HAND
+id: clustering_benchmark_apptainer_optimized_local
+description: Clustering benchmark on Gagolewski's. Using apptainer (locally built image, optimized python)
+software_backend: apptainer
+
+software_environments:
+  clustbench:
+    apptainer: envs/clustbench-optimized.sif
+  fcps:
+    apptainer: envs/fcps.sif
+  rmarkdown:
+    apptainer: envs/fcps.sif
diff --git a/overrides/apptainer_vanilla.yml b/overrides/apptainer_vanilla.yml
new file mode 100644
index 0000000..f0d3bc4
--- /dev/null
+++ b/overrides/apptainer_vanilla.yml
@@ -0,0 +1,12 @@
+# this file has been generated automatically - DO NOT EDIT BY HAND
+id: clustering_benchmark_apptainer_vanilla_local
+description: Clustering benchmark on Gagolewski's. Using apptainer (locally built image)
+software_backend: apptainer
+
+software_environments:
+  clustbench:
+    apptainer: envs/clustbench.sif
+  fcps:
+    apptainer: envs/fcps.sif
+  rmarkdown:
+    apptainer: envs/fcps.sif
diff --git a/overrides/conda.yml b/overrides/conda.yml
new file mode 100644
index 0000000..5f4a1ac
--- /dev/null
+++ b/overrides/conda.yml
@@ -0,0 +1,4 @@
+# this file has been generated automatically - DO NOT EDIT BY HAND
+id: clustering_benchmark_conda
+description: Clustering benchmark on Gagolewski's. Using conda.
+software_backend: conda
diff --git a/overrides/envmodules.yml b/overrides/envmodules.yml
new file mode 100644
index 0000000..a34d58e
--- /dev/null
+++ b/overrides/envmodules.yml
@@ -0,0 +1,4 @@
+# this file has been generated automatically - DO NOT EDIT BY HAND
+id: clustering_benchmark_envmodules
+description: Clustering benchmark on Gagolewski's. Using envmodules.
+software_backend: envmodules
diff --git a/Clustering_conda_smoketest.yml b/smoketest/base.yml
similarity index 92%
rename from Clustering_conda_smoketest.yml
rename to smoketest/base.yml
index 9f66440..db885fe 100644
--- a/Clustering_conda_smoketest.yml
+++ b/smoketest/base.yml
@@ -1,30 +1,27 @@
-id: clustering_example_envmodules
-description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
-version: 1.4
-benchmarker: "Izaskun Mallona, Daniel Incicau"
+version: 1.5
+benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
 benchmark_yaml_spec: 0.5
 
-software_backend: conda
-
 software_environments:
 
   clustbench:
-    description: "clustbench on py3.12.6"
+    description: "clustbench on py3.12.3, default python"
     envmodule: clustbench/0.1.0-foss-2023b
     conda: envs/clustbench.yml
-    apptainer: na
+    apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0
 
   fcps:
     description: "CRAN's FCPS"
     envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
     conda: envs/fcps.yml
-    apptainer: na
+    apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
 
   rmarkdown:
     description: "R with some plotting dependencies"
-    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
     conda: envs/rmarkdown.yml
-    apptainer: na
+    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
+    apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0
+
 
 metric_collectors:
   - id: plotting

From 3a178650d838571af50fc8add21a6b4e4f53ff6e Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 19 May 2025 13:34:17 +0200
Subject: [PATCH 52/60] mv folders to timestamped names

---
 Makefile | 43 ++++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/Makefile b/Makefile
index 16b144a..f622128 100644
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,7 @@ deps:
 	go install github.com/mikefarah/yq/v4@latest
 
 # Generate all the yaml files from base + overrides
+.SILENT: generate
 generate:
 	${YQ_MERGE} overrides/${APPTR}.yml ${BASE} > Clustering_${APPTR}.yml
 	${YQ_MERGE} overrides/${APPTV}.yml ${BASE} > Clustering_${APPTV}.yml
@@ -30,7 +31,9 @@ generate:
 	${YQ_MERGE} overrides/${APPTO}.yml ${BASE_SHORT} > Clustering_${APPTO}_short.yml
 	${YQ_MERGE} overrides/${CONDA}.yml ${BASE_SHORT} > Clustering_${CONDA}_short.yml
 	${YQ_MERGE} overrides/${ENVMD}.yml ${BASE_SHORT} > Clustering_${ENVMD}_short.yml
-
+	echo "[+] The following files have been generated:"
+	ls Clustering_*.yml
+	echo "[+] You can use 'make clean' to delete them"
 
 clean:
 	rm Clustering_*.yml
@@ -43,31 +46,29 @@ prepare_envmodules_env:
 	cd envs && eb rmarkdown.eb --robot
 
 # short versions, to debug runs & environments
-run_with_apptainer_backend_registry_short:
-	 ${OB_CMD} -b Clustering_registry_short.yml
 run_with_apptainer_backend_short:
-	 ${OB_CMD} -b Clustering_apptainer_short.yml
-	 mv out out_apptainer_short
+	 ${OB_CMD} -b Clustering_${APPTR}_short.yml
+	 mv out out_${APPTR}_short-$(shell date +'%Y%m%d%H%M')
+run_with_apptainer_backend_vanilla_short:
+	 ${OB_CMD} -b Clustering_${APPTV}_short.yml
+	 mv out out_${APPTV}_short-$(shell date +'%Y%m%d%H%M')
 run_with_conda_backend_short:
-	 ${OB_CMD} -b Clustering_conda_short.yml
-	 mv out out_conda_short
+	 ${OB_CMD} -b Clustering_${CONDA}_short.yml
+	 mv out out_${CONDA}_short-$(shell date +'%Y%m%d%H%M')
 run_with_envmodules_backend_short:
-	 ${OB_CMD} -b Clustering_envmodules_short.yml
-	 mv out out_lmod_short
+	 ${OB_CMD} -b Clustering_${ENVMD}.yml
+	 mv out out_${ENVMD}_short-$(shell date +'%Y%m%d%H%M')
 
 # full versions (expect hours)
-run_with_apptainer_backend_registry:
-	 ${OB_CMD} -b Clustering_apptainer_registry.yml
-	 mv out out_apptainer_registry
+run_with_apptainer_backend:
+	 ${OB_CMD} -b Clustering_${APPTR}.yml
+	 mv out out_${APPTR}-$(shell date +'%Y%m%d%H%M')
 run_with_apptainer_backend_vanilla:
-	 ${OB_CMD} -b Clustering_apptainer_vanilla.yml
-	 mv out out_apptainer_vanilla
-run_with_apptainer_backend_optimized:
-	 ${OB_CMD} -b Clustering_apptainer_optimized.yml
-	 mv out out_apptainer_vanilla
+	 ${OB_CMD} -b Clustering_${APPTV}.yml
+	 mv out out_${APPTV}-$(shell date +'%Y%m%d%H%M')
 run_with_conda_backend:
-	 ${OB_CMD} -b Clustering_conda.yml
-	 mv out out_conda
+	 ${OB_CMD} -b Clustering_${CONDA}.yml
+	 mv out out_${CONDA}-$(shell date +'%Y%m%d%H%M')
 run_with_envmodules_backend:
-	 ${OB_CMD} -b Clustering_envmodules.yml
-	 mv out out_lmod
+	 ${OB_CMD} -b Clustering_${ENVMD}.yml
+	 mv out out_${ENVMD}-$(shell date +'%Y%m%d%H%M')

From f6caebd9d84d8dbe92e792d60cfe4c5bf0f83849 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 19 May 2025 13:51:14 +0200
Subject: [PATCH 53/60] add --yes flag

---
 .github/workflows/benchmark.yml | 2 +-
 Makefile                        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index b6cb977..7e2a6fc 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -61,7 +61,7 @@ jobs:
         shell: bash -l {0}
         continue-on-error: false
         run: |
-          echo "y" | ob run benchmark -b Clustering_conda_smoketest.yml --local --cores 3 --continue-on-error
+          ob run benchmark -b Clustering_conda_short.yml --local --cores 3 --continue-on-error --yes
 
   upload-artifact:
     name: Benchmark Artifact
diff --git a/Makefile b/Makefile
index f622128..895285a 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ TIMEOUT ?= 4h
 YQ_MERGE=yq eval-all 'select(fileIndex==1) * select(fileIndex==0)'
 
 # by default, we want to run all snakemake rules even if there are failures (-k)
-OB_CMD=ob run benchmark -k --local --task-timeout ${TIMEOUT} --cores ${MAX_CORES}
+OB_CMD=ob run benchmark -k --local --task-timeout ${TIMEOUT} --cores ${MAX_CORES} --yes
 
 APPTR = apptainer
 APPTV = apptainer_vanilla

From f72d0cae0f79adb35930922d809407d243aa947f Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 19 May 2025 14:00:25 +0200
Subject: [PATCH 54/60] update readmes

---
 README.md      |  2 +-
 envs/README.md | 51 +++++++++++++++++++++++++-------------------------
 2 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index ad7de1b..653de7f 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ A clustering example for omnibenchmark
 1. Install omnibenchmark using [our tutorial](https://omnibenchmark.org/tutorial/)
 2. Clone the benchmark definition / this repository with `git clone https://github.com/omnibenchmark/clustering_example`
 3. Move into the cloned folder: `cd clustering_example`
-4. Run locally, somewhat in parallel `ob run benchmark -b CLUSTERING.YAML  --local --threads 6`. Choose `Clustering.yml` specification based on whether running it with conda, easybuild, apptainer, etc. [More details about the available backends](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md).
+4. Run locally, somewhat in parallel `ob run benchmark -b CLUSTERING.YAML  --local --cores 6`. Choose your `Clustering_*.yml` specification based on whether running it with conda, easybuild, apptainer, etc. [More details about the available backends](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md).
 
 # Clustbench attribution
 
diff --git a/envs/README.md b/envs/README.md
index bb1f174..3e1f1e3 100644
--- a/envs/README.md
+++ b/envs/README.md
@@ -1,56 +1,57 @@
 We distribute `Clustering.yml` runs with different backends.
 
-- `Clustering_conda.yml`. Conda semi-reproducible (no pinning, using pip)
-- `Clustering_apptainer.yml`. Singularity semi-reproducible, local SIF files.
-- `Clustering_oras.yml`. Singularity semi-reproducible, prebuilt remote images from an ORAS registry.
 - `Clustering_envmodules.yml`. Easybuild backend with default optimization.
+- `Clustering_apptainer.yml`. Apptainer, pinned, prebuilt remote images from [omnibenchmark's registry](https://quay.io/organization/omnibenchmark).
+- `Clustering_apptainer_vanilla.yml`. Singularity, pinnned, from local SIF images.
+- `Clustering_conda.yml`. Conda semi-reproducible (no pinning, using pip)
 
-## Conda
+## envmodules - reproducible builds with easybuild
 
 ### Files
 
-- `clustbench.yml`
-- `fcps.yml`
-- `rmarkdown.yml`
+- `clustbench.eb`
+- `fcps.eb`
+- `rmarkdown.eb`
+- `rmarkdown-python.eb`
 
 ### How to build
 
-No need to `ob software conda pin / prepare`; let `ob run benchmark -b Clustering_conda.yml --local` do it.
+- `make prepare_envmodules_env` from the root folder.
+
+## Apptainer, pinned, with registry pull
+
+No need to prepare/build anything, since it fetches the apptainer images from a remote registry:
+
+```bash
+make run_with_apptainer_backend
+```
 
-## Apptainer semi-reproducible and local
+## Apptainer, pinned, local build
 
 ### Files
 
 The apptainer images are based in ubuntu-noble docker images.
 
-The "optimized" one does a custom python 3.12 compilation; the vanillapy stocks the default py3.12 interpreter from the official ubuntu docker image.
+The "optimized" flavor does a custom python 3.12 compilation; the vanillapy stocks the default py3.12 interpreter from the official ubuntu docker image.
 
 - `clustbench_apptainer_optimized.def`
 - `clustbench_apptainer_vanillapy.def`
 - `fcps.def`
 
-### How to build
+### How to build the SIF images
 
 - `make prepare_apptainer_env` from the root folder.
 
-## Aptainer semi-reproducible with registry pull
-
-No need to prepare/build anything, since it fetches the apptainer images from a remote registry:
-
-```bash
-ob run benchmark -b Clustering_apptainer_registry.yml --local
-```
-
-## envmodules - reproducible builds with easybuild
+## Conda
 
 ### Files
 
-- `clustbench.eb`
-- `fcps.eb`
-- `rmarkdown.eb`
-- `rmarkdown-python.eb`
+- `clustbench.yml`
+- `fcps.yml`
+- `rmarkdown.yml`
 
 ### How to build
 
-- `make prepare_envmodules_env` from the root folder.
+No need to `ob software conda pin / prepare`. Just use `ob run benchmark -b Clustering_conda.yml --local`.
+
 

From eef58c4c54de53eeb1e7210800e1bd77359c8e2b Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 19 May 2025 15:50:56 +0200
Subject: [PATCH 55/60] extract modules with yq

---
 Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile b/Makefile
index 895285a..5477572 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,7 @@
 MAX_CORES ?= 10
 TIMEOUT ?= 4h
 YQ_MERGE=yq eval-all 'select(fileIndex==1) * select(fileIndex==0)'
+YQ_REPOS=yq '.stages[].modules[] | .id + ": " + .repository.url + "@" + .repository.commit'
 
 # by default, we want to run all snakemake rules even if there are failures (-k)
 OB_CMD=ob run benchmark -k --local --task-timeout ${TIMEOUT} --cores ${MAX_CORES} --yes
@@ -72,3 +73,6 @@ run_with_conda_backend:
 run_with_envmodules_backend:
 	 ${OB_CMD} -b Clustering_${ENVMD}.yml
 	 mv out out_${ENVMD}-$(shell date +'%Y%m%d%H%M')
+
+extract_modules:
+	@${YQ_REPOS} base.yml

From f076b837cb76d6ce076e67c5c554aa8418b4c845 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Tue, 20 May 2025 13:51:26 +0200
Subject: [PATCH 56/60] envmodules short

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 5477572..71b6860 100644
--- a/Makefile
+++ b/Makefile
@@ -57,7 +57,7 @@ run_with_conda_backend_short:
 	 ${OB_CMD} -b Clustering_${CONDA}_short.yml
 	 mv out out_${CONDA}_short-$(shell date +'%Y%m%d%H%M')
 run_with_envmodules_backend_short:
-	 ${OB_CMD} -b Clustering_${ENVMD}.yml
+	 ${OB_CMD} -b Clustering_${ENVMD}_short.yml
 	 mv out out_${ENVMD}_short-$(shell date +'%Y%m%d%H%M')
 
 # full versions (expect hours)

From 478ef276c61056ecba3d341093d4b1e733010aa6 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Tue, 20 May 2025 13:55:45 +0200
Subject: [PATCH 57/60] bump version in README

---
 README.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 653de7f..e772f13 100644
--- a/README.md
+++ b/README.md
@@ -2,10 +2,12 @@ A clustering example for omnibenchmark
 
 # How to run
 
-1. Install omnibenchmark using [our tutorial](https://omnibenchmark.org/tutorial/)
-2. Clone the benchmark definition / this repository with `git clone https://github.com/omnibenchmark/clustering_example`
+1. Install omnibenchmark: `pip install "omnibenchmark>=0.2.0"`
+2. Clone the benchmark definition in this repository with `git clone https://github.com/omnibenchmark/clustering_example`
 3. Move into the cloned folder: `cd clustering_example`
-4. Run locally, somewhat in parallel `ob run benchmark -b CLUSTERING.YAML  --local --cores 6`. Choose your `Clustering_*.yml` specification based on whether running it with conda, easybuild, apptainer, etc. [More details about the available backends](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md).
+4. Run locally, with the desired degree of parallelism:
+   `ob run benchmark -b <Clustering_flavor.yaml> --local --cores 6`.
+   Choose your `Clustering_*.yml` specification based on the backend you want to run (conda, easybuild or apptainer). [More details about the available backends and how to build or enable them](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md).
 
 # Clustbench attribution
 

From 0896ffab1af42bce0b923b95f3ab4bb15db43f3d Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Tue, 20 May 2025 16:21:15 +0200
Subject: [PATCH 58/60] track main branch

---
 .github/workflows/benchmark.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 7e2a6fc..cb744c4 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -48,14 +48,14 @@ jobs:
         shell: bash -l {0}
         run: |
           mamba install -y pip
-          pip install git+https://github.com/omnibenchmark/omnibenchmark.git@dev
+          pip install git+https://github.com/omnibenchmark/omnibenchmark.git@main
 
       - name: Load benchmark cache
         id: cache-benchmark
         uses: actions/cache@v3
         with:
           path: out/
-          key: benchmark-${{ runner.os }}-${{ hashFiles('Clustering.yaml') }}
+          key: benchmark-${{ runner.os }}-${{ hashFiles('Clustering_conda_short.yml') }}
 
       - name: Run benchmark
         shell: bash -l {0}

From 1c6179fb6c26c7864435e53ed5f8ccd507a1bd42 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Tue, 20 May 2025 16:27:36 +0200
Subject: [PATCH 59/60] blah blah

---
 README.md | 41 +++++++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index e772f13..7859113 100644
--- a/README.md
+++ b/README.md
@@ -9,24 +9,23 @@ A clustering example for omnibenchmark
    `ob run benchmark -b <Clustering_flavor.yaml> --local --cores 6`.
    Choose your `Clustering_*.yml` specification based on the backend you want to run (conda, easybuild or apptainer). [More details about the available backends and how to build or enable them](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md).
 
-# Clustbench attribution
-
-by Marek Gagolewski, modified by Izaskun Mallona
-
-# Data disclaimer
-
-Some datasets are commented out to speed up calculations.
+# Software backends and variants
 
-From [Are cluster validity measures (in) valid?](https://www.sciencedirect.com/science/article/pii/S0020025521010082):
+* All needed recipes can be found under `envs`: conda, apptainer, easybuild (lmod modules)
+* The `_short` variants are meant for [quick testing](https://en.wikipedia.org/wiki/Smoke_testing_(software))
+* The default `apptainer` container fetches images from an online registry.
+* `apptainer_vanilla` refers to a locally built container image with stock python (`3.12`)
+* `apptainer_optimized` refers to a locally built container image with a custom compiled python (`3.12.9`), just to check if optimization flags have a noticeable effect.
+* `envmodules` will need you to previously build the `.eb` easyconfigs with easybuild. We plan to make these modules publicly available in the future.
+* `conda` environments will fetch software from the configured conda channels and pypi. Does not compile anything, fetches pre-built binaries (assuming there's a build in those channels for your architecture, that is)
 
-> The original benchmark battery consists of 79 data instances, however 16 datasets are accompanied by labels that yield ; they were omitted for their computation would be too lengthy (namely: mnist/digits, mnist/fashion, other/chameleon_t7_10k, other/chameleon_t8_8k, sipu/a1, sipu/a2, sipu/a3, sipu/birch1, sipu/birch2, sipu/d31, sipu/s1, sipu/s2, sipu/s3, sipu/s4, sipu/worms_2, sipu/worms_64). Also uci/glass has been removed as one of its 25-near-neighbour graph’s connected components was too small for the NN-based methods to succeed. This leaves us with 62 datasets in total, see Table 1.
+[More info in the envs/ folder](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md).
 
-A yaml such as [0a88c91](https://github.com/omnibenchmark/clustering_example/blob/0a88c910bbda62d1b593f4215a682770227f39ff/Clustering.yaml) with 30 cores should run half of the stuff in ~4 h and reach 97% completion in ~8h.
 
 # Summary
 
 - Data. Example datasets (not a comprehensive list, it's >79 of them):
-  - https://github.com/imallona/clustbench_data 
+  - https://github.com/imallona/clustbench_data
     - args: ["--dataset_generator", "mnist", "--dataset_name", "fashion"]
     - args: ["--dataset_generator", "other", "--dataset_name", "iris"]
     - args: ["--dataset_generator", "mnist", "--dataset_name", "digits"]
@@ -39,7 +38,7 @@ A yaml such as [0a88c91](https://github.com/omnibenchmark/clustering_example/blo
     - args: ["--linkage", "weighted"]
     - args: ["--linkage", "median"]
     - args: ["--linkage", "centroid"]
-  - https://github.com/imallona/clustbench_sklearn 
+  - https://github.com/imallona/clustbench_sklearn
     - args: ["--method", "birch"]
     - args: ["--method", "kmeans"]
     - args: ["--method", "spectral"] ## too slow
@@ -86,11 +85,7 @@ A yaml such as [0a88c91](https://github.com/omnibenchmark/clustering_example/blo
   - https://github.com/omnibenchmark-example/ward.git
   - https://github.com/omnibenchmark-example/ari.git
   - https://github.com/omnibenchmark-example/accuracy.git
-  
-  
-# Software backends
 
-In `envs`: conda, apptainer, easybuild (lmod modules)
 
 # Omnibenchmark YAML generation
 
@@ -102,3 +97,17 @@ Install [yq](https://github.com/mikefarah/yq) and run `make generate` if you wan
 Mind we try to run clusterings specifying the true number of clusters +- 2. But sometimes the true number is k=3. Then we do `k=2, k=2, k=3, k=5, k=6` filling with k=2s as needed, and recomputing the same values multiple times (so runtimes are comparable across datasets, regardless of their true number of clusters).
 
 Also, we have modules by Daniel not fully incorporated into Gagolewski's flow.
+
+# Data disclaimer
+
+Some datasets are commented out to speed up calculations.
+
+From [Are cluster validity measures (in) valid?](https://www.sciencedirect.com/science/article/pii/S0020025521010082):
+
+> The original benchmark battery consists of 79 data instances, however 16 datasets are accompanied by labels that yield ; they were omitted for their computation would be too lengthy (namely: mnist/digits, mnist/fashion, other/chameleon_t7_10k, other/chameleon_t8_8k, sipu/a1, sipu/a2, sipu/a3, sipu/birch1, sipu/birch2, sipu/d31, sipu/s1, sipu/s2, sipu/s3, sipu/s4, sipu/worms_2, sipu/worms_64). Also uci/glass has been removed as one of its 25-near-neighbour graph’s connected components was too small for the NN-based methods to succeed. This leaves us with 62 datasets in total, see Table 1.
+
+A yaml such as [0a88c91](https://github.com/omnibenchmark/clustering_example/blob/0a88c910bbda62d1b593f4215a682770227f39ff/Clustering.yaml) with 30 cores should run half of the stuff in ~4 h and reach 97% completion in ~8h.
+
+# Clustbench attribution
+
+by Marek Gagolewski, modified by Izaskun Mallona

From b64949e7a36f00d9430ef3ddd526dec08681fc49 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Tue, 20 May 2025 16:31:53 +0200
Subject: [PATCH 60/60] add comment

---
 envs/README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/envs/README.md b/envs/README.md
index 3e1f1e3..335a9d7 100644
--- a/envs/README.md
+++ b/envs/README.md
@@ -3,8 +3,12 @@ We distribute `Clustering.yml` runs with different backends.
 - `Clustering_envmodules.yml`. Easybuild backend with default optimization.
 - `Clustering_apptainer.yml`. Apptainer, pinned, prebuilt remote images from [omnibenchmark's registry](https://quay.io/organization/omnibenchmark).
 - `Clustering_apptainer_vanilla.yml`. Singularity, pinnned, from local SIF images.
+- `Clustering_apptainer_optimized.yml`. Singularity, pinned, from local SIF images. This image compiles a custom python with optimization flags.
 - `Clustering_conda.yml`. Conda semi-reproducible (no pinning, using pip)
 
+The `_short` variants are meant to run smoketests and check that there are no operational problems (abnormal terminations, etc.) when running the environments.
+
+
 ## envmodules - reproducible builds with easybuild
 
 ### Files