From 4a3fdc3941a2b097bb5c97889f7dd0bc657e342d Mon Sep 17 00:00:00 2001 From: ben Date: Thu, 8 May 2025 12:18:01 +0200 Subject: [PATCH 01/60] run dev branch --- .github/workflows/benchmark.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c1a1e82..500eb58 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -18,7 +18,6 @@ jobs: run-benchmark: name: Run Benchmark runs-on: ubuntu-latest - ## runs-on: self-hosted steps: - name: Check out repository uses: actions/checkout@v4 @@ -49,7 +48,7 @@ jobs: shell: bash -l {0} run: | mamba install -y pip - pip install git+https://github.com/omnibenchmark/omnibenchmark.git@reduce_install_scope + pip install git+https://github.com/omnibenchmark/omnibenchmark.git@dev - name: Load benchmark cache id: cache-benchmark @@ -67,7 +66,6 @@ jobs: upload-artifact: name: Benchmark Artifact runs-on: ubuntu-latest - ## runs-on: self-hosted needs: run-benchmark if: always() steps: From e89adda93e7fdc64b52b2e77dc702969a50f735c Mon Sep 17 00:00:00 2001 From: btraven <128150520+btraven00@users.noreply.github.com> Date: Mon, 17 Mar 2025 16:20:31 +0100 Subject: [PATCH 02/60] docs: use the public repo URI --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a75c594..89d7c05 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,8 @@ A clustering example for omnibenchmark # How to run 1. Install omnibenchmark using [our tutorial](https://omnibenchmark.org/tutorial/) -2. Clone the benchmark definition / this repository with `git clone git@github.com:omnibenchmark/clustering_example.git` -3. Move to the cloned repository `cd clustering_example` +2. Clone the benchmark definition / this repository with `git clone https://github.com/omnibenchmark/clustering_example` +3. Move into the cloned folder: `cd clustering_example` 4. 
Run locally, somewhat in parallel `ob run benchmark -b CLUSTERING.YAML --local --threads 6`. Choose `Clustering.yml` specification based on whether running it with conda, easybuild, apptainer, etc. [More details about the available backends](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md). # Clustbench attribution From 52ebb556eae88f36d2e857aadfe8189c4aca3eaf Mon Sep 17 00:00:00 2001 From: ben Date: Tue, 18 Mar 2025 13:02:37 +0100 Subject: [PATCH 03/60] chore: add convenience target to build singularity env - make script executable - use /bin/sh instead of /bin/bash - add top-level Makefile to prepare env --- Makefile | 2 ++ envs/build_singularity.sh | 14 +++++--------- 2 files changed, 7 insertions(+), 9 deletions(-) create mode 100644 Makefile mode change 100644 => 100755 envs/build_singularity.sh diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..1e56cb2 --- /dev/null +++ b/Makefile @@ -0,0 +1,2 @@ +prepare_apptainer_env: + cd envs && ./build_singularity.sh diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh old mode 100644 new mode 100755 index 86e053f..c0c3d93 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -1,9 +1,5 @@ -#!/bin/bash - -sudo singularity build sklearn.sif sklearn_singularity.def - -sudo singularity build clustbench.sif clustbench_singularity.def - -sudo singularity build r.sif r_singularity.def - -sudo singularity build fcps.sif fcps_singularity.def +#!/bin/sh +singularity build sklearn.sif sklearn_singularity.def +singularity build clustbench.sif clustbench_singularity.def +singularity build r.sif r_singularity.def +singularity build fcps.sif fcps_singularity.def From 83c6f0b0c78851d93be5956fd27a8180c61b2ba7 Mon Sep 17 00:00:00 2001 From: ben Date: Tue, 18 Mar 2025 13:59:05 +0100 Subject: [PATCH 04/60] feat: parametrize num threads on the makefile --- Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Makefile b/Makefile index 1e56cb2..3c58e2f 
100644 --- a/Makefile +++ b/Makefile @@ -1,2 +1,10 @@ +MAX_THREADS ?= 30 +OB_CMD="ob run benchmark -k --local" prepare_apptainer_env: cd envs && ./build_singularity.sh +run_with_apptainer_backend: + ${OB_CMD} -b Clustering_singularity.yml --threads ${MAX_THREADS} + mv out out_apptainer +run_with_conda_backend: + ${OB_CMD} -b Clustering_conda.yml --threads ${MAX_THREADS} + mv out out_conda From dc2d629004fcdb40f75bc24194287b961eb40283 Mon Sep 17 00:00:00 2001 From: ben Date: Tue, 18 Mar 2025 14:02:21 +0100 Subject: [PATCH 05/60] chore: ignore common temporary outputs and image build artifacts --- .gitignore | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4d38534 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +# image build artifacts +envs/*.sif + +# snakemake +snakemake.log +.snakemake/ + +# vim swaps +*.swp +*.swo From f91603aecf8f82975087c89615d3473d4b79c12f Mon Sep 17 00:00:00 2001 From: ben Date: Tue, 18 Mar 2025 13:59:05 +0100 Subject: [PATCH 06/60] feat: parametrize num threads on the makefile --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3c58e2f..6883fa0 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ MAX_THREADS ?= 30 -OB_CMD="ob run benchmark -k --local" +# by default, we want to run all snakemake rules even if there are failures +OB_CMD=ob run benchmark -k --local prepare_apptainer_env: cd envs && ./build_singularity.sh run_with_apptainer_backend: From bea2a75173f9c19edb2adb1c22bc1ab90d62774d Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 5 May 2025 10:07:36 +0200 Subject: [PATCH 07/60] fix: use --cores, --task-timeout --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 6883fa0..73b33b5 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,11 @@ -MAX_THREADS ?= 30 +MAX_CORES ?= 10 # by default, we want to run all snakemake rules even 
if there are failures -OB_CMD=ob run benchmark -k --local +OB_CMD=ob run benchmark -k --local --task-timeout "4h" prepare_apptainer_env: cd envs && ./build_singularity.sh run_with_apptainer_backend: - ${OB_CMD} -b Clustering_singularity.yml --threads ${MAX_THREADS} + ${OB_CMD} -b Clustering_singularity.yml --cores ${MAX_CORES} mv out out_apptainer run_with_conda_backend: - ${OB_CMD} -b Clustering_conda.yml --threads ${MAX_THREADS} + ${OB_CMD} -b Clustering_conda.yml --cores ${MAX_CORES} mv out out_conda From 67e8cf8bc7e0deab9f6bfdc5aceaffe39841040e Mon Sep 17 00:00:00 2001 From: ben Date: Wed, 7 May 2025 21:41:46 +0200 Subject: [PATCH 08/60] update .eb files to easybuild 5.0 --- Makefile | 9 ++++-- envs/clustbench.eb | 81 ++++++---------------------------------------- envs/fcps.eb | 18 ++++------- 3 files changed, 23 insertions(+), 85 deletions(-) diff --git a/Makefile b/Makefile index 73b33b5..e107f62 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,14 @@ MAX_CORES ?= 10 # by default, we want to run all snakemake rules even if there are failures -OB_CMD=ob run benchmark -k --local --task-timeout "4h" +OB_CMD=ob run benchmark -k --local --task-timeout "4h" --cores ${MAX_CORES} prepare_apptainer_env: cd envs && ./build_singularity.sh +prepare_envmodules_env: + cd envs && eb clustbench.eb --robot + cd envs && eb fcps.eb --robot run_with_apptainer_backend: - ${OB_CMD} -b Clustering_singularity.yml --cores ${MAX_CORES} + ${OB_CMD} -b Clustering_singularity.yml mv out out_apptainer run_with_conda_backend: - ${OB_CMD} -b Clustering_conda.yml --cores ${MAX_CORES} + ${OB_CMD} -b Clustering_conda.yml mv out out_conda diff --git a/envs/clustbench.eb b/envs/clustbench.eb index 22597fb..f3ee681 100644 --- a/envs/clustbench.eb +++ b/envs/clustbench.eb @@ -1,108 +1,47 @@ -## largely as https://github.com/easybuilders/easybuild-easyconfigs/blob/949c266db9e17440ec2829eb8ffdbdb87ceaf543/easybuild/easyconfigs/c/cooler/cooler-0.10.2-foss-2023b.eb#L4 - easyblock = 'PythonBundle' name 
= 'clustbench' -version = '1' +version = '0.1.0' -homepage = 'https://python.org/' +homepage = 'https://omnibenchmark.org' description = "Bundle of Python packages for ob clustering_example" toolchain = {'name': 'foss', 'version': '2023b'} - dependencies = [ ('Python', '3.11.5'), - ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b ('SciPy-bundle', '2023.11'), - ('meson-python', '0.15.0'), ('matplotlib', '3.8.2'), - ('scikit-learn', '1.4.0') - + ('scikit-learn', '1.4.0'), +# ('meson-python', '0.15.0'), +# ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b ] -sanity_pip_check = True -use_pip = True - -exts_default_options = { - 'sanity_pip_check': True, - 'use_pip' : True -} - -## https://files.pythonhosted.org/packages/27/fe/e78538f4cd7b1b28e9c625eabd21f314004d00644a8347d0b01473e72ffa/clustering_benchmarks-1.1.5.tar.gz -## https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip -## https://files.pythonhosted.org/packages/5d/b8/f143d907d93bd4a3dd51d07c4e79b37bedbfc2177f4949bfa0d6ba0af647/fastcluster-1.2.6.tar.gz -## https://files.pythonhosted.org/packages/68/7c/d465bab9f98b75c5c1f5e80165dd82847a504ced655d162b585df08a717b/genieclust-1.1.6.tar.gz -## https://files.pythonhosted.org/packages/a2/45/eaaacaa4f4f2931a80d40e453df275d9af7c07616c5d753272d3055fb79e/genieclust-1.1.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - -source_urls = [PYPI_SOURCE, - 'https://files.pythonhosted.org/packages/27/fe/e78538f4cd7b1b28e9c625eabd21f314004d00644a8347d0b01473e72ffa/', - 'https://files.pythonhosted.org/packages/68/7c/d465bab9f98b75c5c1f5e80165dd82847a504ced655d162b585df08a717b/', - 'https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/', - 'https://files.pythonhosted.org/packages/5d/b8/f143d907d93bd4a3dd51d07c4e79b37bedbfc2177f4949bfa0d6ba0af647/', - 
'https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/', - 'https://files.pythonhosted.org/packages/84/4d/b720d6000f4ca77f030bd70f12550820f0766b568e43f11af7f7ad9061aa/', - 'https://files.pythonhosted.org/packages/68/dd/fa2e1a45fce2d09f4aea3cee169760e672c8262325aa5796c49d543dc7e6/', - 'https://files.pythonhosted.org/packages/84/4d/b720d6000f4ca77f030bd70f12550820f0766b568e43f11af7f7ad9061aa', - 'https://files.pythonhosted.org/packages/67/66/91d242ea8dd1729addd36069318ba2cd03874872764f316c3bb51b633ed2/', - 'https://files.pythonhosted.org/packages/e2/a9/a0c57aee75f77794adaf35322f8b6404cbd0f89ad45c87197a937764b7d0/', - 'https://files.pythonhosted.org/packages/d2/c1/72b9622fcb32ff98b054f724e213c7f70d6898baa714f4516288456ceaba/', - 'https://github.com/pybind/pybind11/archive/', - 'https://files.pythonhosted.org/packages/a9/95/a3dbbb5028f35eafb79008e7522a75244477d2838f38cbb722248dabc2a8/', - 'https://files.pythonhosted.org/packages/84/a4/d9da2989a3d937e94616ef07f0630c507f6baa77ad37f94ceb06b36cacc1/python3-wget-0.0.2-beta1.tar.gz', - 'https://files.pythonhosted.org/packages/6a/ef/6e3736663ee67369f7f5b697674bfbd3efc91e7096ddd4452bbbc80065ff/hypothesis-6.124.7.tar.gz', - 'https://files.pythonhosted.org/packages/03/c6/14a17e10813b8db20d1e800ff9a3a898e65d25f2b0e9d6a94616f1e3362c/numpy-1.23.0.tar.gz', - 'https://files.pythonhosted.org/packages/f6/d8/ab692a75f584d13c6542c3994f75def5bce52ded9399f52e230fe402819d/numpy-1.22.4.zip', - 'https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz', - 'https://files.pythonhosted.org/packages/64/33/60135848598c076ce4b231e1b1895170f45fbcaeaa2c9d5e38b04db70c35/joblib-1.4.2.tar.gz', - 'https://files.pythonhosted.org/packages/bd/55/b5148dcbf72f5cde221f8bfe3b6a540da7aa1842f6b491ad979a6c8b84af/threadpoolctl-3.5.0.tar.gz', - 
'https://files.pythonhosted.org/packages/9e/a5/4ae3b3a0755f7b35a280ac90b28817d1f380318973cff14075ab41ef50d9/scikit_learn-1.6.1.tar.gz', - 'https://files.pythonhosted.org/packages/ef/e5/c09d20723bfd91315f6f4ddc77912b0dcc09588b4ca7ad2ffa204607ad7f/scikit-learn-1.4.2.tar.gz', - 'https://files.pythonhosted.org/packages/ee/5e/16e17bedcf54d5b618dc0771690deda77178e5c310402881c3d2d6c5f27c/hurry.filesize-0.9.tar.gz'] - - -## caution download genieclust here, not pypi, they differ and pypi's it's not installable! -## cd /home/imallona/.local/easybuild/sources/c/clustbench/extensions/ -## wget wget https://github.com/gagolews/genieclust/archive/refs/tags/v1.1.6.tar.gz -O genieclust-1.1.6.tar.gz -## todo automate this within the easyconfig! - exts_list = [ ('natsort', '8.4.0', { 'checksums': ['45312c4a0e5507593da193dedd04abb1469253b601ecaf63445ad80f0a1ea581'], }), - ('cython', '3.0.11', { - 'checksums': ['7146dd2af8682b4ca61331851e6aebce9fe5158e75300343f80c07ca80b1faff'], - }), ('hypothesis', '6.124.7', { 'checksums': ['8ed6c6ae47e7d26d869c1dc3dee04e8fc50c95240715bb9915ded88d6d920f0e'], }), - ('numpy', '1.26.4', { - 'checksums': ['2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010'], - }), ('fastcluster', '1.2.6', { 'checksums': ['aab886efa7b6bba7ac124f4498153d053e5a08b822d2254926b7206cdf5a8aa6'], }), - ('genieclust', '1.1.6', { - 'checksums': ['fb5b4ff68eef9e73496afa5949e726c8522c72e51f092716a6a598b03d5c09d6'], - }), ('hurry.filesize', '0.9', { 'checksums': ['f5368329adbef86accd3bc9490522340bb79260455ae89b1a42c10f63801b9a6'], }), ('python3-wget', '0.0.2-beta1', { 'modulename': 'wget', + 'source_urls': ['https://files.pythonhosted.org/packages/84/a4/d9da2989a3d937e94616ef07f0630c507f6baa77ad37f94ceb06b36cacc1/'], 'checksums': ['bbe7f44b3c28c4f7126aff20e8a438e78f6e4f1878d8b0c4940e87363813c17d'], }), - ('clustering_benchmarks', '1.1.5', { - 'modulename': 'clustbench', - 'checksums': ['1732c262fb13be2f88814ef9a19c60108e91a7f6cfb9b960a42feaa299034ea3'], + 
('genieclust', '1.1.6', { + 'download_dep_fail': False, + 'install_src': 'https://files.pythonhosted.org/packages/2a/09/d1fd7b02cfabe76262d0f88d74fa71dc93e857525f8249539ec5ab174292/genieclust-1.1.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl', + 'checksums': ['4c159f507b84b6d6d171883223648d837c520a9bcce650944a6ee0cb320e2151'], }), ] -sanity_check_paths = { - 'files': [], - 'dirs': ['lib/python3.11/site-packages/clustbench/'] -} - moduleclass = 'bio' diff --git a/envs/fcps.eb b/envs/fcps.eb index ee3db52..54c8c7d 100644 --- a/envs/fcps.eb +++ b/envs/fcps.eb @@ -1,6 +1,3 @@ -## largely as in https://github.com/easybuilders/easybuild-easyconfigs/commit/e9a36171c68414f933ab1afa03b32422491f0f96#diff-3f2a92ab6ab59ddaccf4bc61b59bdd3f6717b95fd019131a57f51eefc831a699 -## Caution boost easyconfig needs update https://raw.githubusercontent.com/easybuilders/easybuild-easyconfigs/refs/heads/develop/easybuild/easyconfigs/b/Boost/Boost-1.82.0-GCC-12.3.0.eb (https://github.com/easybuilders/easybuild-easyconfigs/commit/e29210626f076e3a207f1abf3759ea124e28f8b2) - easyblock = 'Bundle' name = 'fcps' @@ -23,10 +20,7 @@ dependencies = [ exts_default_options = { 'source_urls': [ - 'https://bioconductor.org/packages/3.18/bioc/src/contrib/', - 'https://bioconductor.org/packages/3.18/bioc/src/contrib/Archive/%(name)s', - 'https://bioconductor.org/packages/3.18/data/annotation/src/contrib/', - 'https://bioconductor.org/packages/3.18/data/experiment/src/contrib/', + 'https://bioconductor.org/packages/release/bioc/src/contrib/', 'https://cran.r-project.org/src/contrib/Archive/%(name)s', # package archive 'https://cran.r-project.org/src/contrib/', # current version of packages 'https://cran.freestatistics.org/src/contrib', # mirror alternative for current packages @@ -192,13 +186,15 @@ exts_list = [ ('cluster', '2.1.8', { 'checksums': ['c32a462e34694c99d58da953efa74882b5427f8c5db7cb226ae15c54ce6060ca'], }), - ('graph', '1.84.1', { - 'checksums': 
['cd2a91c93c81c09d9c59853c417e8a9cdde39b0589bacdce4ca916b6ee5f45a7'], + ('graph', '1.86.0', { + 'checksums': ['ac9e196dfcb43848a851ea2d339cff41f8f16c7e80e76282c8fe7b822df8f367'], }), ('mclust', '6.1.1', { 'checksums': ['ddd7018e5e6ea7f92c7fc9872b391491b7e91c2cd89ef1dcaf4408afb5116775'], }), - ('cclust', '0.6-26'), + ('cclust', '0.6-26', { + 'checksums': ['92ec3c55a1864e4e1a4706bfdef8ad00727c720213ac656c718e867286b29857'], + }), ('flowClust', '3.40.0', { 'installopts': "--configure-args='--with-gsl=${EBROOTGSL} --enable-bundled-gsl=false'", 'checksums': ['7e699b06e378e32144704dbec18289109980b0f5eca166180f2c30007b83e0f5'], @@ -240,4 +236,4 @@ sanity_check_paths = { 'dirs': ['FCPS', 'dbscan', 'energy', 'protoclust'], } -moduleclass = 'bio' \ No newline at end of file +moduleclass = 'bio' From 931389f796ef8ceb7e4951c80c708e1b2c2129b1 Mon Sep 17 00:00:00 2001 From: ben Date: Thu, 8 May 2025 12:38:02 +0200 Subject: [PATCH 09/60] remove remote storage --- Clustering.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Clustering.yaml b/Clustering.yaml index 0007ea5..689be2c 100644 --- a/Clustering.yaml +++ b/Clustering.yaml @@ -2,10 +2,10 @@ id: clustering_example description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
version: 1.2 benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: https://play.min.io benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clustering_example +# storage: https://play.min.io +# storage_api: S3 +# storage_bucket_name: clustering_example software_backend: conda software_environments: clustbench: From 60ac47b3c55bec65b5ad839d524a7b8cd87b1b4c Mon Sep 17 00:00:00 2001 From: ben Date: Thu, 8 May 2025 12:41:44 +0200 Subject: [PATCH 10/60] do not run artifact if not in main repo --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 500eb58..2a55846 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -67,7 +67,7 @@ jobs: name: Benchmark Artifact runs-on: ubuntu-latest needs: run-benchmark - if: always() + if: github.ref == 'refs/heads/main' && github.repository_owner == 'omnibenchmark' steps: - name: Check out repository uses: actions/checkout@v4 From 1b972bfef0d7a74199d0289d8b7b8749720bce27 Mon Sep 17 00:00:00 2001 From: ben Date: Thu, 8 May 2025 12:45:12 +0200 Subject: [PATCH 11/60] Update Makefile --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e107f62..875a375 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ MAX_CORES ?= 10 -# by default, we want to run all snakemake rules even if there are failures +# by default, we want to run all snakemake rules even if there are failures (-k) OB_CMD=ob run benchmark -k --local --task-timeout "4h" --cores ${MAX_CORES} prepare_apptainer_env: cd envs && ./build_singularity.sh @@ -12,3 +12,6 @@ run_with_apptainer_backend: run_with_conda_backend: ${OB_CMD} -b Clustering_conda.yml mv out out_conda +run_with_envmodules_backend: + ${OB_CMD} -b Clustering_envmodules.yml + mv out out_lmod From 49646db648dee014b3a43f655ef64147cbda6ed0 Mon Sep 17 00:00:00 2001 From: ben Date: Thu, 8 May 2025 
13:22:56 +0200 Subject: [PATCH 12/60] streamline envmodules yaml --- Clustering_envmodules.yml | 281 ++++++++++++++++++++------------------ 1 file changed, 149 insertions(+), 132 deletions(-) diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml index 3c2b8bd..1ab4808 100644 --- a/Clustering_envmodules.yml +++ b/Clustering_envmodules.yml @@ -2,32 +2,21 @@ id: clustering_example_envmodules description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. version: 1.4 benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: http://omnibenchmark.org:9000 -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clusteringexampleenvmodules +benchmark_yaml_spec: 0.5 + software_backend: envmodules + software_environments: clustbench: description: "clustbench on py3.12.6" - conda: envs/clustbench.yml - envmodule: clustbench - apptainer: envs/clustbench.sif - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: envs/sklearn.sif - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: envs/r.sif - envmodule: fcps # not true, but + envmodule: clustbench/0.1.0-foss-2023b # py3.11 tho + conda: na + apptainer: na rmarkdown: description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml - apptainer: envs/r.sif # not true, but - envmodule: fcps # not true, but + envmodule: rmarkdown # TODO + conda: na + apptainer: na fcps: description: "CRAN's FCPS" conda: envs/fcps.yml @@ -56,42 +45,132 @@ stages: repository: url: https://github.com/imallona/clustbench_data commit: 366c5a2 - parameters: # comments depict the possible cardinalities and the number of curated labelsets + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 
1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 + - values: [ + "--dataset_generator", + "fcps", + "--dataset_name", + "chainlink", + ] # 2 1 + - values: [ + "--dataset_generator", + "fcps", + "--dataset_name", + "engytime", + ] # 2 2 - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 + - values: [ + "--dataset_generator", + "fcps", + "--dataset_name", + "twodiamonds", + ] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "fuzzyx", + ] # 2, 4, 5 6 - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "parabolic", + ] # 2, 4 2 - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 - - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 - - values: ["--dataset_generator", "other", 
"--dataset_name", "chameleon_t5_8k"] # 6 1 - - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "ring_noisy", + ] # 2 1 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "ring_outliers", + ] # 2, 5 2 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "zigzag", + ] # 3, 5 2 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "zigzag_noisy", + ] # 3, 5 2 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "zigzag_outliers", + ] # 3, 5 2 + - values: [ + "--dataset_generator", + "other", + "--dataset_name", + "chameleon_t4_8k", + ] # 6 1 + - values: [ + "--dataset_generator", + "other", + "--dataset_name", + "chameleon_t5_8k", + ] # 6 1 + - values: [ + "--dataset_generator", + "other", + "--dataset_name", + "hdbscan", + ] # 6 1 - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 + - values: [ + "--dataset_generator", + "sipu", + "--dataset_name", + "aggregation", + ] # 7 1 + - values: [ + "--dataset_generator", + "sipu", + "--dataset_name", + "compound", + ] # 4, 5, 6 5 - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 + - values: [ + "--dataset_generator", + "sipu", + "--dataset_name", + "pathbased", + ] # 3, 4 2 - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 
- - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 + - values: [ + "--dataset_generator", + "sipu", + "--dataset_name", + "unbalance", + ] # 8 1 - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 + - values: [ + "--dataset_generator", + "uci", + "--dataset_name", + "ionosphere", + ] # 2 1 - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 @@ -100,8 +179,18 @@ stages: - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 + - values: [ + "--dataset_generator", + "wut", + "--dataset_name", + "isolation", + ] # 3 1 + - values: [ + "--dataset_generator", + "wut", + "--dataset_name", + "labirynth", + ] # 6 1 - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 @@ -109,9 +198,24 @@ stages: - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 + - values: [ + 
"--dataset_generator", + "wut", + "--dataset_name", + "trajectories", + ] # 4 1 + - values: [ + "--dataset_generator", + "wut", + "--dataset_name", + "trapped_lovers", + ] # 3 1 + - values: [ + "--dataset_generator", + "wut", + "--dataset_name", + "twosplashes", + ] # 2 1 - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 @@ -126,7 +230,7 @@ stages: path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" ## clustbench methods (fastcluster) ################################################################### - + - id: clustering modules: - id: fastcluster @@ -148,7 +252,6 @@ stages: software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_sklearn - #url: /home/imallona/src/clustbench_sklearn commit: 5877378 parameters: - values: ["--method", "birch"] @@ -229,89 +332,3 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - - # ## daniel's data ########################################################################### - - # - id: danielsdata - # modules: - # - id: iris_manual - # name: "Iris Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/iris.git - # commit: 47c63f0 - # - id: penguins - # name: "Penguins Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/penguins.git - # commit: 9032478 - # outputs: - # - id: data.features - # path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv" - # - id: data.labels - # path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv" - - # ## daniel's distances ######################################################################## - - # - id: distances - # modules: - # - id: D1 - # software_environment: "sklearn" - # parameters: - # - values: 
["--measure", "cosine"] - # - values: ["--measure", "euclidean"] - # - values: ["--measure", "manhattan"] - # - values: ["--measure", "chebyshev"] - # repository: - # url: https://github.com/omnibenchmark-example/distance.git - # commit: dd99d4f - # inputs: - # - entries: - # - data.features - # outputs: - # - id: distances - # path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv" - - # ## daniel's methods ################################################################### - - # - id: danielmethods - # modules: - # - id: kmeans - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/kmeans.git - # commit: 049c8b1 - # - id: ward - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ward.git - # commit: 976e3f3 - # inputs: - # - entries: - # - distances - # outputs: - # - id: methods.clusters - # path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv" - - # ## daniel's metrics ################################################################### - - # - id: danielsmetrics - # modules: - # - id: ari - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ari.git - # commit: 72708f0 - # - id: accuracy - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/accuracy.git - # commit: e26b32f - # inputs: - # - entries: - # - methods.clusters - # - data.labels - # outputs: - # - id: metrics.mapping - # path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt" From fc53991d1eb32c7749c3f1a2bccc0ed9e33601af Mon Sep 17 00:00:00 2001 From: ben Date: Thu, 8 May 2025 15:14:35 +0200 Subject: [PATCH 13/60] update clustbench --- Clustering_envmodules.yml | 38 ++++++++++++++++++++------------------ envs/clustbench.eb | 5 +++++ 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml index 1ab4808..f37fd6c 100644 --- 
a/Clustering_envmodules.yml +++ b/Clustering_envmodules.yml @@ -9,23 +9,24 @@ software_backend: envmodules software_environments: clustbench: description: "clustbench on py3.12.6" - envmodule: clustbench/0.1.0-foss-2023b # py3.11 tho - conda: na + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml + apptainer: na + fcps: + description: "CRAN's FCPS" + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml apptainer: na rmarkdown: description: "R with some plotting dependencies" envmodule: rmarkdown # TODO - conda: na + conda: envs/clustbench.yml apptainer: na - fcps: - description: "CRAN's FCPS" - conda: envs/fcps.yml - apptainer: envs/fcps.sif - envmodule: fcps + metric_collectors: - id: plotting name: "Single-backend metric collector." - software_environment: "rmarkdown" + software_environment: rmarkdown repository: url: https://github.com/imallona/clustering_report commit: 1d6bdf5 @@ -34,14 +35,15 @@ metric_collectors: outputs: - id: plotting.html path: "{input}/{name}/plotting_report.html" + stages: - ## clustbench data ########################################################## - id: data + ## clustbench data modules: - id: clustbench name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data commit: 366c5a2 @@ -235,7 +237,7 @@ stages: modules: - id: fastcluster name: "fastcluster algorithm" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_fastcluster # url: /home/imallona/src/clustbench_fastcluster/ @@ -249,7 +251,7 @@ stages: - values: ["--linkage", "centroid"] - id: sklearn name: "sklearn" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_sklearn commit: 5877378 @@ -260,7 +262,7 @@ stages: - 
values: ["--method", "gm"] - id: agglomerative name: "agglomerative" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_agglomerative commit: 5454368 @@ -270,7 +272,7 @@ stages: - values: ["--linkage", "ward"] - id: genieclust name: "genieclust" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_genieclust commit: 6090043 @@ -280,7 +282,7 @@ stages: - values: ["--method", "ica"] - id: fcps name: "fcps" - software_environment: "fcps" + software_environment: fcps repository: url: https://github.com/imallona/clustbench_fcps commit: 272fa5f @@ -309,10 +311,10 @@ stages: modules: - id: partition_metrics name: "clustbench partition metrics" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_metrics - commit: 9132d45 + commit: 8184cd4 parameters: - values: ["--metric", "normalized_clustering_accuracy"] - values: ["--metric", "adjusted_fm_score"] diff --git a/envs/clustbench.eb b/envs/clustbench.eb index f3ee681..0e86911 100644 --- a/envs/clustbench.eb +++ b/envs/clustbench.eb @@ -13,6 +13,7 @@ dependencies = [ ('SciPy-bundle', '2023.11'), ('matplotlib', '3.8.2'), ('scikit-learn', '1.4.0'), +# FIXME: I think this is not needed -- ben # ('meson-python', '0.15.0'), # ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b ] @@ -40,6 +41,10 @@ exts_list = [ 'install_src': 'https://files.pythonhosted.org/packages/2a/09/d1fd7b02cfabe76262d0f88d74fa71dc93e857525f8249539ec5ab174292/genieclust-1.1.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl', 'checksums': ['4c159f507b84b6d6d171883223648d837c520a9bcce650944a6ee0cb320e2151'], }), + ('clustering_benchmarks', '1.1.6', { + 'modulename': 'clustbench', + 'checksums': ['8c3ac0aed7c4c4925df6e5000db29aed6359341bd1ef2e516f230e13d8b66a0c'], + }), ] moduleclass = 'bio' From 
54b72790b1e2d2d9aa66d30c3d956b5d8be387a3 Mon Sep 17 00:00:00 2001 From: ben Date: Sat, 10 May 2025 18:46:41 +0200 Subject: [PATCH 14/60] add rmarkdown-python bundles, without checksums --- envs/rmarkdown-python.eb | 28 ++++++++++++ envs/rmarkdown.eb | 94 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 envs/rmarkdown-python.eb create mode 100644 envs/rmarkdown.eb diff --git a/envs/rmarkdown-python.eb b/envs/rmarkdown-python.eb new file mode 100644 index 0000000..a9edb00 --- /dev/null +++ b/envs/rmarkdown-python.eb @@ -0,0 +1,28 @@ +easyblock = 'Bundle' + +# This is a dummy bundle that installs: +# 1. rmarkdown: an R bundle that we also package +# 2. Python-3.12.3-GCCcore-13.3.0 +# This is a dependency for the clustering_benchmark metric collector. + +name = 'rmarkdown-python' +version = '0.1.0' + +local_rver = '4.4.2' +local_pyver = '3.12.3' +versionsuffix = f'-r-{local_rver}-py-{local_pyver}' + +homepage = 'https://omnibenchmark.org' +description = 'Rmarkdown bundle with specific Python dependency' + +toolchain = {'name': 'system', 'version': '1.0'} + +dependencies = [ + ('rmarkdown', '0.1.0', f'-gfbf-2024a-r-{local_rver}'), + ('Python', local_pyver, '-GCCcore-13.3.0'), +] + +sanity_check_paths = { + 'files': [], + 'dirs': ['../../rmarkdown/0.1.0-gfbf-2024a-r-4.4.2'] +} diff --git a/envs/rmarkdown.eb b/envs/rmarkdown.eb new file mode 100644 index 0000000..a88a2a9 --- /dev/null +++ b/envs/rmarkdown.eb @@ -0,0 +1,94 @@ +easyblock = 'Bundle' + +# TODO(ben): Try to use https://www.eessi.io/docs/available_software/detail/R-bundle-CRAN/ +# and build only what's left out. 
+ +name = 'rmarkdown' +version = '0.1.0' +versionsuffix = '-r-%(rver)s' + +homepage = 'https://omnibenchmark.org' +description = 'rmarkdown bundle for clustbench reports' + +toolchain = {'name': 'gfbf', 'version': '2024a'} + +dependencies = [ + ('R', '4.4.2'), +] + +exts_default_options = { + 'source_urls': [ + 'https://cloud.r-project.org/src/contrib/', + 'https://cran.r-project.org/src/contrib/', # current version of packages + 'https://cran.r-project.org/src/contrib/Archive/%(name)s', # package archive + 'https://www.bioconductor.org/packages/release/bioc/src/contrib/', # bioconductor + ], + 'sources': ['%(name)s_%(version)s.tar.gz'], +} + +exts_defaultclass = 'RPackage' + + +exts_list = [ + ('rlang', '1.1.6'), + ('glue', '1.8.0'), + ('cli', '3.6.4'), + ('lifecycle', '1.0.4'), + ('vctrs', '0.6.5'), + ('utf8', '1.2.4'), + ('lattice', '0.22-5'), + ('pkgconfig', '2.0.3'), + ('pillar', '1.10.2'), + ('magrittr', '2.0.3'), + ('fansi', '1.0.6'), + ('viridisLite', '0.4.2'), + ('RColorBrewer', '1.1-3'), + ('R6', '2.6.1'), + ('labeling', '0.4.3'), + ('farver', '2.1.2'), + ('Matrix', '1.7-3'), + ('nlme', '3.1-168'), + ('withr', '3.0.2'), + ('tibble', '3.2.1'), + ('colorspace', '2.1-1'), + ('munsell', '0.5.1'), + ('scales', '1.3.0'), + ('mgcv', '1.9-1'), + ('MASS', '7.3-65'), + ('isoband', '0.2.7'), + ('gtable', '0.3.6'), + ('ggplot2', '3.5.2'), + ('findpython', '1.0.9', {}), + ('argparse', '2.2.5', {}), + ('rmarkdown', '2.29', {}), + ('generics', '0.1.3', {}), + ('tidyselect', '1.2.1', {}), + ('dplyr', '1.1.4', {}), + ('tidyr', '1.3.1', {}), + ('shape', '1.4.6.1', {}), + ('GlobalOptions', '0.1.2', {}), + ('circlize', '0.4.16', {}), + ('rjson', '0.2.23', {}), + ('GetoptLong', '1.0.5', {}), + ('cluster', '2.1.8.1', {}), + ('clue', '0.3-66', {}), + ('png', '0.1-8', {}), + ('BiocGenerics', '0.54.0', {}), + ('S4Vectors', '0.46.0', {}), + ('IRanges', '2.42.0'), + ('matrixStats', '1.5.0', {}), + ('iterators', '1.0.14', {}), + ('codetools', '0.2-20', {}), + ('foreach', '1.5.2', 
{}), + ('doParallel', '1.0.17', {}), + ('ComplexHeatmap', '2.24.0', {}), +] + +modextrapaths = {'R_LIBS_SITE': ''} + +sanity_check_paths = { + 'files': [], + 'dirs': ['argparse', 'rmarkdown', 'ggplot2', 'tidyr', 'ComplexHeatmap'], +} + +moduleclass = 'bio' From 1b57e44585c688d6f5e8f5be4b38b039e73cab57 Mon Sep 17 00:00:00 2001 From: ben Date: Sat, 10 May 2025 18:49:27 +0200 Subject: [PATCH 15/60] inject checksums to rmarkdown easyconfig --- envs/rmarkdown.eb | 209 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 156 insertions(+), 53 deletions(-) diff --git a/envs/rmarkdown.eb b/envs/rmarkdown.eb index a88a2a9..067eadd 100644 --- a/envs/rmarkdown.eb +++ b/envs/rmarkdown.eb @@ -28,60 +28,163 @@ exts_default_options = { exts_defaultclass = 'RPackage' - exts_list = [ - ('rlang', '1.1.6'), - ('glue', '1.8.0'), - ('cli', '3.6.4'), - ('lifecycle', '1.0.4'), - ('vctrs', '0.6.5'), - ('utf8', '1.2.4'), - ('lattice', '0.22-5'), - ('pkgconfig', '2.0.3'), - ('pillar', '1.10.2'), - ('magrittr', '2.0.3'), - ('fansi', '1.0.6'), - ('viridisLite', '0.4.2'), - ('RColorBrewer', '1.1-3'), - ('R6', '2.6.1'), - ('labeling', '0.4.3'), - ('farver', '2.1.2'), - ('Matrix', '1.7-3'), - ('nlme', '3.1-168'), - ('withr', '3.0.2'), - ('tibble', '3.2.1'), - ('colorspace', '2.1-1'), - ('munsell', '0.5.1'), - ('scales', '1.3.0'), - ('mgcv', '1.9-1'), - ('MASS', '7.3-65'), - ('isoband', '0.2.7'), - ('gtable', '0.3.6'), - ('ggplot2', '3.5.2'), - ('findpython', '1.0.9', {}), - ('argparse', '2.2.5', {}), - ('rmarkdown', '2.29', {}), - ('generics', '0.1.3', {}), - ('tidyselect', '1.2.1', {}), - ('dplyr', '1.1.4', {}), - ('tidyr', '1.3.1', {}), - ('shape', '1.4.6.1', {}), - ('GlobalOptions', '0.1.2', {}), - ('circlize', '0.4.16', {}), - ('rjson', '0.2.23', {}), - ('GetoptLong', '1.0.5', {}), - ('cluster', '2.1.8.1', {}), - ('clue', '0.3-66', {}), - ('png', '0.1-8', {}), - ('BiocGenerics', '0.54.0', {}), - ('S4Vectors', '0.46.0', {}), - ('IRanges', '2.42.0'), - ('matrixStats', '1.5.0', {}), - 
('iterators', '1.0.14', {}), - ('codetools', '0.2-20', {}), - ('foreach', '1.5.2', {}), - ('doParallel', '1.0.17', {}), - ('ComplexHeatmap', '2.24.0', {}), + ('rlang', '1.1.6', { + 'checksums': ['18544c876f4e18ec554edecc308362a52fbc7e0805c4794cf59bcc4d0b57f330'], + }), + ('glue', '1.8.0', { + 'checksums': ['c86f364ba899b8662f5da3e1a75f43ae081ab04e0d51171d052356e7ee4b72a0'], + }), + ('cli', '3.6.4', { + 'checksums': ['0c39539ce173bcbf7abaca64e8d2c87ffec8257c144c31b793c4cf2dd9cf7620'], + }), + ('lifecycle', '1.0.4', { + 'checksums': ['ada4d3c7e84b0c93105e888647c5754219a8334f6e1f82d5afaf83d4855b91cc'], + }), + ('vctrs', '0.6.5', { + 'checksums': ['43167d2248fd699594044b5c8f1dbb7ed163f2d64761e08ba805b04e7ec8e402'], + }), + ('utf8', '1.2.4', { + 'checksums': ['418f824bbd9cd868d2d8a0d4345545c62151d321224cdffca8b1ffd98a167b7d'], + }), + ('lattice', '0.22-5', { + 'checksums': ['ba1fbe5e18a133507dca9851b7f933002bdb6d1f3ea5f410a0a441103b6da5f1'], + }), + ('pkgconfig', '2.0.3', { + 'checksums': ['330fef440ffeb842a7dcfffc8303743f1feae83e8d6131078b5a44ff11bc3850'], + }), + ('pillar', '1.10.2', { + 'checksums': ['2cdbe3fe1b28b62530880ab26fc3c874e0dd5060767ae1a8ee5685f65e56d645'], + }), + ('magrittr', '2.0.3', { + 'checksums': ['a2bff83f792a1acb801bfe6330bb62724c74d5308832f2cb6a6178336ace55d2'], + }), + ('fansi', '1.0.6', { + 'checksums': ['ea9dc690dfe50a7fad7c5eb863c157d70385512173574c56f4253b6dfe431863'], + }), + ('viridisLite', '0.4.2', { + 'checksums': ['893f111d31deccd2cc959bc9db7ba2ce9020a2dd1b9c1c009587e449c4cce1a1'], + }), + ('RColorBrewer', '1.1-3', { + 'checksums': ['4f42f5423c45688b39f492c7892d93f37b4541831c8ffb140364d2bd89031ac0'], + }), + ('R6', '2.6.1', { + 'checksums': ['59c6eba8b1b912eb7e104f65053235604be853425ee67c152ac4e86a1f2073b4'], + }), + ('labeling', '0.4.3', { + 'checksums': ['c62f4fc2cc74377d7055903c5f1913b7295f7587456fe468592738a483e264f2'], + }), + ('farver', '2.1.2', { + 'checksums': ['528823b95daab4566137711f1c842027a952bea1b2ae6ff098e2ca512b17fe25'], 
+ }), + ('Matrix', '1.7-3', { + 'checksums': ['6642e9db8cddf32a051972fd5a634bf7edbdc925c5c2d139bf71e92df00fb44e'], + }), + ('nlme', '3.1-168', { + 'checksums': ['23b78468344cb6775dee5e0d9c8133032d64f08ebaba20776508a0443a897362'], + }), + ('withr', '3.0.2', { + 'checksums': ['0a3a05f493d275cca4bf13c8c1b95a1a4eed7f83b2493f41fde02ce3fc92c1a3'], + }), + ('tibble', '3.2.1', { + 'checksums': ['65a72d0c557fd6e7c510d150c935ed6ced5db7d05fc20236b370f11428372131'], + }), + ('colorspace', '2.1-1', { + 'checksums': ['e721cee5f4d6e4b0fc8eb18265e316b4f856fd3be02f0775a26032663758cd0b'], + }), + ('munsell', '0.5.1', { + 'checksums': ['03a2fd9ac40766cded96dfe33b143d872d0aaa262a25482ce19161ca959429a6'], + }), + ('scales', '1.3.0', { + 'checksums': ['b33e0f6b44259551ce02befd52eac53602509fbfdd903920620c658c50f35888'], + }), + ('mgcv', '1.9-1', { + 'checksums': ['700fbc37bedd3a49505b9bc4949faee156d9cfb4f669d797d06a10a15a5bdb32'], + }), + ('MASS', '7.3-65', { + 'checksums': ['b07ef1e3c364ce56269b4a8a7759cc9f87c876554f91293437bb578cfe38172f'], + }), + ('isoband', '0.2.7', { + 'checksums': ['7693223343b45b86de2b5b638ff148f0dafa6d7b1237e822c5272902f79cdf61'], + }), + ('gtable', '0.3.6', { + 'checksums': ['d305a5fa11278b649d2d8edc5288bf28009be888a42be58ff8714018e49de0ef'], + }), + ('ggplot2', '3.5.2', { + 'checksums': ['0a30024a2ff3e569412223c8f14563ed504f3e0851de03e42d1b5f73fe1f06bf'], + }), + ('findpython', '1.0.9', { + 'checksums': ['b6a15e0cdfcdd4b1cfc76f7e4eaad0125d4d52889711200075280e9b2a2cb7cb'], + }), + ('argparse', '2.2.5', { + 'checksums': ['53c8a9eb51041084eb3d9c271b14ebcb32dc2f50cf16afa5c54c504a97229ea4'], + }), + (name, '2.29', { + 'checksums': ['6662ac85316c869caad6e3b95468cad97f6eef106d47b066db8d40c05a490928'], + }), + ('generics', '0.1.3', { + 'checksums': ['75046163bfa8b8a4f4214c1b689e796207f6447182f2e5062cf570302387d053'], + }), + ('tidyselect', '1.2.1', { + 'checksums': ['169e97ba0bbfbcdf4a80534322751f87a04370310c40e27f04aac6525d45903c'], + }), + ('dplyr', '1.1.4', { + 
'checksums': ['cf730414d5d4ab387b4e9890a4b1df9d17a3903488e8da8df1cf2e11e44558cb'], + }), + ('tidyr', '1.3.1', { + 'checksums': ['e820c261cb5543f572f49276a7bdc7302aa4215da4bf850b1b939a315353835d'], + }), + ('shape', '1.4.6.1', { + 'checksums': ['43f9bd0f997fd6cf1838efd8b2509c9a6396513f4e54a20360481634affd22a4'], + }), + ('GlobalOptions', '0.1.2', { + 'checksums': ['47890699668cfa9900a829c51f8a32e02a7a7764ad07cfac972aad66f839753e'], + }), + ('circlize', '0.4.16', { + 'checksums': ['16dc32c7704906d13a9e5281bb396e92fb89a6b17fa5e201953240726b650b67'], + }), + ('rjson', '0.2.23', { + 'checksums': ['55034575c854ed657e6701da278c0fdea251479624d06a963b2e58461a5f0f48'], + }), + ('GetoptLong', '1.0.5', { + 'checksums': ['8c237986ed3dfb72d956ad865ef7768644eebf144675ad66140acfd1aca9d701'], + }), + ('cluster', '2.1.8.1', { + 'checksums': ['4b95b78e09b17ddca72edc0bb180c753c004ed2f61c3eb12e0451ac77f441e57'], + }), + ('clue', '0.3-66', { + 'checksums': ['aa86dd58c05635eb394c9ede0dd15a4f24af4815f299451bbc7895c0f737c2fb'], + }), + ('png', '0.1-8', { + 'checksums': ['5a36fabb6d62ba2533d3fc4cececd07891942cfb76fe689ec0d550d08762f61c'], + }), + ('BiocGenerics', '0.54.0', { + 'checksums': ['413d6f74cbc671147f63eefc46b718af815d6497535c2198925d9306e00c41b9'], + }), + ('S4Vectors', '0.46.0', { + 'checksums': ['c34249c6a367a2a1e94158d9e60294f2b901e485d93717250a417569be187a40'], + }), + ('IRanges', '2.42.0', { + 'checksums': ['0abb01ee93111c5fc678f9aa2f93d00d8d1548263cb60daa52645a6061b603fc'], + }), + ('matrixStats', '1.5.0', { + 'checksums': ['12996c5f3e6fc202a43e1087f16a71b7fa93d7e908f512542c7ee89cf95dcc15'], + }), + ('iterators', '1.0.14', { + 'checksums': ['cef3075a0930e1408c764e4da56bbadd4f7d14315809df8f38dd51f80ccc677b'], + }), + ('codetools', '0.2-20', { + 'checksums': ['3be6f375ec178723ddfd559d1e8e85bfeee04a5fbaf9f53f2f844e1669fea863'], + }), + ('foreach', '1.5.2', { + 'checksums': ['56338d8753f9f68f262cf532fd8a6d0fe25a71a2ff0107f3ce378feb926bafe4'], + }), + ('doParallel', '1.0.17', { + 
'checksums': ['b96a25ad105a654d70c7b4ca27290dc9967bc47f4668b2763927a886b178abd7'], + }), + ('ComplexHeatmap', '2.24.0', { + 'checksums': ['2a015ad26c5a5f003ee203d77cc8d3eea5461bcf2db7ce102da1bef7db082650'], + }), ] modextrapaths = {'R_LIBS_SITE': ''} From dfd5b936195655c136bf513640c5a5196a7785ea Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 12:18:59 +0200 Subject: [PATCH 16/60] update sklearn singularity definition --- envs/sklearn_singularity.def | 57 ++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/envs/sklearn_singularity.def b/envs/sklearn_singularity.def index 939a3bb..56bcf37 100644 --- a/envs/sklearn_singularity.def +++ b/envs/sklearn_singularity.def @@ -1,33 +1,54 @@ -Bootstrap: docker -From: ubuntu:jammy-20240911.1 +Bootstrap: docker +From: ubuntu:noble-20250404 %labels - - AUTHOR izaskun.mallona@gmail.com + Author izaskun.mallona@gmail.com + Author ben.uzh@proton.me %post + PYTHON_VERSION=3.12.6 + PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. 
-f1,2) - # Install python3.12 + # Update and enable deb-src + apt-get update + echo "deb-src http://archive.ubuntu.com/ubuntu/ noble main restricted universe multiverse" >> /etc/apt/sources.list + echo "deb-src http://archive.ubuntu.com/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git - - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 4 - make altinstall - # virtualenv + + # Get build dependencies for Python + apt-get build-dep -y python3 + + # Extra dependencies + apt-get install -y git python-is-python3 wget zlib1g-dev libbz2-dev libssl-dev libffi-dev + + # Calculate half the number of available cores + HALF_NPROC=$(( $(nproc) / 2 )) + # Ensure at least one core is used + CORES_TO_USE=$(( HALF_NPROC > 0 ? HALF_NPROC : 1 )) + + # Download and build Python with optimizations + wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz + tar -xf Python-${PYTHON_VERSION}.tgz + cd Python-${PYTHON_VERSION}*/ + # Enable all possible optimizations + ./configure --enable-optimizations --with-lto --enable-shared LDFLAGS="-Wl,-rpath /usr/local/lib" + make -j ${CORES_TO_USE} + make altinstall + + # Create virtualenv using the locally built Python cd /opt - python3.12 -m venv "default" + /usr/local/bin/python${PYTHON_MAJOR_VERSION} -m venv "default" . default/bin/activate - - pip3 install -U scikit-learn pandas argparse numpy scipy "isodate" "pydantic-core" "gitpython==3.1.43" + + # Install required packages + pip3 install "clustering-benchmarks==1.1.5" "wget" "fastcluster==1.2.6" "numpy==1.26.4" "scipy==1.14.1" \ + "isodate" "pydantic-core" \ + "genieclust==1.1.6" "pandas==2.2.3" "gitpython==3.1.43" echo '. 
/opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT %environment . /opt/default/bin/activate + From 0056b7fce71ab1e5efc9456502c2114ea4d597d7 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 12:25:21 +0200 Subject: [PATCH 17/60] factorize sklearn singularity pip block --- envs/sklearn-pip.apptainer.include | 11 +++++++++++ envs/sklearn_singularity.def | 19 +++++++++---------- 2 files changed, 20 insertions(+), 10 deletions(-) create mode 100644 envs/sklearn-pip.apptainer.include diff --git a/envs/sklearn-pip.apptainer.include b/envs/sklearn-pip.apptainer.include new file mode 100644 index 0000000..b8f48eb --- /dev/null +++ b/envs/sklearn-pip.apptainer.include @@ -0,0 +1,11 @@ + pip3 install \ + "clustering-benchmarks==1.1.6" \ + "fastcluster==1.2.6" \ + "numpy==1.26.4" \ + "scipy==1.14.1" \ + "isodate" \ + "pydantic-core" \ + "genieclust==1.1.6" \ + "pandas==2.2.3" \ + "gitpython==3.1.43" \ + wget" diff --git a/envs/sklearn_singularity.def b/envs/sklearn_singularity.def index 56bcf37..cb9a2f6 100644 --- a/envs/sklearn_singularity.def +++ b/envs/sklearn_singularity.def @@ -8,25 +8,25 @@ From: ubuntu:noble-20250404 %post PYTHON_VERSION=3.12.6 PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. -f1,2) - + # Update and enable deb-src apt-get update echo "deb-src http://archive.ubuntu.com/ubuntu/ noble main restricted universe multiverse" >> /etc/apt/sources.list echo "deb-src http://archive.ubuntu.com/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list apt-get update - + # Get build dependencies for Python apt-get build-dep -y python3 # Extra dependencies apt-get install -y git python-is-python3 wget zlib1g-dev libbz2-dev libssl-dev libffi-dev - + # Calculate half the number of available cores HALF_NPROC=$(( $(nproc) / 2 )) # Ensure at least one core is used CORES_TO_USE=$(( HALF_NPROC > 0 ? 
HALF_NPROC : 1 )) - + # Download and build Python with optimizations wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz tar -xf Python-${PYTHON_VERSION}.tgz @@ -35,16 +35,15 @@ From: ubuntu:noble-20250404 ./configure --enable-optimizations --with-lto --enable-shared LDFLAGS="-Wl,-rpath /usr/local/lib" make -j ${CORES_TO_USE} make altinstall - + # Create virtualenv using the locally built Python cd /opt /usr/local/bin/python${PYTHON_MAJOR_VERSION} -m venv "default" . default/bin/activate - - # Install required packages - pip3 install "clustering-benchmarks==1.1.5" "wget" "fastcluster==1.2.6" "numpy==1.26.4" "scipy==1.14.1" \ - "isodate" "pydantic-core" \ - "genieclust==1.1.6" "pandas==2.2.3" "gitpython==3.1.43" + + # Install required packages with pip + + % include sklearn-pip.apptainer.include echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT From cef3a6b6f0c0c0cb564941dee77eb52e9fd207db Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 12:27:48 +0200 Subject: [PATCH 18/60] extract variable in build script --- envs/build_singularity.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index c0c3d93..c5cbf6f 100755 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -1,5 +1,7 @@ #!/bin/sh -singularity build sklearn.sif sklearn_singularity.def -singularity build clustbench.sif clustbench_singularity.def -singularity build r.sif r_singularity.def -singularity build fcps.sif fcps_singularity.def +CMD=singularity +BUILD=build --fakeroot +$CMD $BUILD sklearn-optimized.sif sklearn_singularity.def +$CMD $BUILD clustbench-optimized.sif clustbench_singularity.def +$CMD $BUILD r.sif r_singularity.def +$CMD $BUILD fcps.sif fcps_singularity.def From 2ee17ca636501521efb2c650a0b76750052692ee Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 12:37:19 +0200 Subject: [PATCH 19/60] revert include, should use m4 --- 
envs/build_singularity.sh | 2 +- envs/sklearn-pip.apptainer.include | 11 ----------- ...ularity.def => sklearn_singularity_optimized.def} | 12 +++++++++++- 3 files changed, 12 insertions(+), 13 deletions(-) delete mode 100644 envs/sklearn-pip.apptainer.include rename envs/{sklearn_singularity.def => sklearn_singularity_optimized.def} (85%) diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index c5cbf6f..61fbd13 100755 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -1,7 +1,7 @@ #!/bin/sh CMD=singularity BUILD=build --fakeroot -$CMD $BUILD sklearn-optimized.sif sklearn_singularity.def +$CMD $BUILD sklearn_optimized.sif sklearn_singularity_optimized.def $CMD $BUILD clustbench-optimized.sif clustbench_singularity.def $CMD $BUILD r.sif r_singularity.def $CMD $BUILD fcps.sif fcps_singularity.def diff --git a/envs/sklearn-pip.apptainer.include b/envs/sklearn-pip.apptainer.include deleted file mode 100644 index b8f48eb..0000000 --- a/envs/sklearn-pip.apptainer.include +++ /dev/null @@ -1,11 +0,0 @@ - pip3 install \ - "clustering-benchmarks==1.1.6" \ - "fastcluster==1.2.6" \ - "numpy==1.26.4" \ - "scipy==1.14.1" \ - "isodate" \ - "pydantic-core" \ - "genieclust==1.1.6" \ - "pandas==2.2.3" \ - "gitpython==3.1.43" \ - wget" diff --git a/envs/sklearn_singularity.def b/envs/sklearn_singularity_optimized.def similarity index 85% rename from envs/sklearn_singularity.def rename to envs/sklearn_singularity_optimized.def index cb9a2f6..6d6e165 100644 --- a/envs/sklearn_singularity.def +++ b/envs/sklearn_singularity_optimized.def @@ -43,7 +43,17 @@ From: ubuntu:noble-20250404 # Install required packages with pip - % include sklearn-pip.apptainer.include + pip3 install \ + "clustering-benchmarks==1.1.6" \ + "fastcluster==1.2.6" \ + "numpy==1.26.4" \ + "scipy==1.14.1" \ + "isodate" \ + "pydantic-core" \ + "genieclust==1.1.6" \ + "pandas==2.2.3" \ + "gitpython==3.1.43" \ + wget" echo '. 
/opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT From c4cbe5c2f22ed52a4873d5a14781682a19e87a4f Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 12:44:30 +0200 Subject: [PATCH 20/60] update python version --- envs/sklearn_singularity_optimized.def | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/envs/sklearn_singularity_optimized.def b/envs/sklearn_singularity_optimized.def index 6d6e165..17a131d 100644 --- a/envs/sklearn_singularity_optimized.def +++ b/envs/sklearn_singularity_optimized.def @@ -6,7 +6,7 @@ From: ubuntu:noble-20250404 Author ben.uzh@proton.me %post - PYTHON_VERSION=3.12.6 + PYTHON_VERSION=3.12.9 PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. -f1,2) # Update and enable deb-src From 21bdd666d47d029d5463f814ab685389ec850f71 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 13:25:31 +0200 Subject: [PATCH 21/60] do a little bit of cleanup with the multiple envs --- ....yml => Clustering_apptainer_optimized.yml | 129 +++--------------- Clustering_conda.yml | 125 +++-------------- Clustering_envmodules.yml | 25 ++-- envs/build_singularity.sh | 7 +- ...def => clustbench_apptainer_optimized.def} | 37 +++-- envs/clustbench_apptainer_vanillapy.def | 55 ++++++++ envs/clustbench_singularity.def | 35 ----- ...ity.def => fcps_singularity_optimized.def} | 11 +- envs/sklearn.yml | 11 -- 9 files changed, 145 insertions(+), 290 deletions(-) rename Clustering_singularity.yml => Clustering_apptainer_optimized.yml (74%) rename envs/{sklearn_singularity_optimized.def => clustbench_apptainer_optimized.def} (71%) create mode 100644 envs/clustbench_apptainer_vanillapy.def delete mode 100644 envs/clustbench_singularity.def rename envs/{fcps_singularity.def => fcps_singularity_optimized.def} (79%) delete mode 100644 envs/sklearn.yml diff --git a/Clustering_singularity.yml b/Clustering_apptainer_optimized.yml similarity index 74% rename from Clustering_singularity.yml rename to Clustering_apptainer_optimized.yml index 
c80b498..96e357e 100644 --- a/Clustering_singularity.yml +++ b/Clustering_apptainer_optimized.yml @@ -1,38 +1,32 @@ id: clustering_example_apptainer + description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. -version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: http://omnibenchmark.org:9000 -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clusteringexampleapptainer +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 + software_backend: apptainer + software_environments: + clustbench: description: "clustbench on py3.12.6" - conda: envs/clustbench.yml - envmodule: clustbench - apptainer: envs/clustbench.sif - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: envs/sklearn.sif - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: envs/r.sif - envmodule: fcps # not true, but + conda: envs/clustbench.yml # not used + envmodule: na + apptainer: envs/clustbench-optimized.sif + rmarkdown: description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml - apptainer: envs/r.sif # not true, but - envmodule: fcps # not true, but + conda: envs/rmarkdown.yml # not used + envmodule: na + apptainer: envs/rmarkdown.sif + fcps: description: "CRAN's FCPS" conda: envs/fcps.yml + envmodule: na apptainer: envs/fcps.sif - envmodule: fcps + metric_collectors: - id: plotting name: "Single-backend metric collector." 
@@ -45,10 +39,11 @@ metric_collectors: outputs: - id: plotting.html path: "{input}/{name}/plotting_report.html" + stages: - ## clustbench data ########################################################## - id: data + ## clustbench data modules: - id: clustbench name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" @@ -229,89 +224,3 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - - # ## daniel's data ########################################################################### - - # - id: danielsdata - # modules: - # - id: iris_manual - # name: "Iris Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/iris.git - # commit: 47c63f0 - # - id: penguins - # name: "Penguins Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/penguins.git - # commit: 9032478 - # outputs: - # - id: data.features - # path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv" - # - id: data.labels - # path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv" - - # ## daniel's distances ######################################################################## - - # - id: distances - # modules: - # - id: D1 - # software_environment: "sklearn" - # parameters: - # - values: ["--measure", "cosine"] - # - values: ["--measure", "euclidean"] - # - values: ["--measure", "manhattan"] - # - values: ["--measure", "chebyshev"] - # repository: - # url: https://github.com/omnibenchmark-example/distance.git - # commit: dd99d4f - # inputs: - # - entries: - # - data.features - # outputs: - # - id: distances - # path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv" - - # ## daniel's methods ################################################################### - - # - id: danielmethods - # modules: - # - id: kmeans - # software_environment: "sklearn" - # 
repository: - # url: https://github.com/omnibenchmark-example/kmeans.git - # commit: 049c8b1 - # - id: ward - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ward.git - # commit: 976e3f3 - # inputs: - # - entries: - # - distances - # outputs: - # - id: methods.clusters - # path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv" - - # ## daniel's metrics ################################################################### - - # - id: danielsmetrics - # modules: - # - id: ari - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ari.git - # commit: 72708f0 - # - id: accuracy - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/accuracy.git - # commit: e26b32f - # inputs: - # - entries: - # - methods.clusters - # - data.labels - # outputs: - # - id: metrics.mapping - # path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt" diff --git a/Clustering_conda.yml b/Clustering_conda.yml index 7ac1629..61352e1 100644 --- a/Clustering_conda.yml +++ b/Clustering_conda.yml @@ -1,38 +1,32 @@ id: clustering_example_conda + description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
-version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: http://omnibenchmark.org:9000 -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clusteringexampleconda +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 + software_backend: conda + software_environments: + clustbench: description: "clustbench on py3.12.6" conda: envs/clustbench.yml envmodule: clustbench - apptainer: envs/clustbench.sif - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: envs/sklearn.sif - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: envs/r.sif - envmodule: fcps # not true, but + apptainer: na + rmarkdown: description: "R with some plotting dependencies" conda: envs/rmarkdown.yml - apptainer: envs/r.sif # not true, but - envmodule: fcps # not true, but + envmodule: fcps # not used + apptainer: na + fcps: description: "CRAN's FCPS" conda: envs/fcps.yml - apptainer: envs/fcps.sif envmodule: fcps + apptainer: na + metric_collectors: - id: plotting name: "Single-backend metric collector." 
@@ -45,6 +39,7 @@ metric_collectors: outputs: - id: plotting.html path: "{input}/{name}/plotting_report.html" + stages: ## clustbench data ########################################################## @@ -52,7 +47,7 @@ stages: modules: - id: clustbench name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data commit: 366c5a2 @@ -145,7 +140,7 @@ stages: - values: ["--linkage", "centroid"] - id: sklearn name: "sklearn" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_sklearn #url: /home/imallona/src/clustbench_sklearn @@ -229,89 +224,3 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - - # ## daniel's data ########################################################################### - - # - id: danielsdata - # modules: - # - id: iris_manual - # name: "Iris Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/iris.git - # commit: 47c63f0 - # - id: penguins - # name: "Penguins Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/penguins.git - # commit: 9032478 - # outputs: - # - id: data.features - # path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv" - # - id: data.labels - # path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv" - - # ## daniel's distances ######################################################################## - - # - id: distances - # modules: - # - id: D1 - # software_environment: "sklearn" - # parameters: - # - values: ["--measure", "cosine"] - # - values: ["--measure", "euclidean"] - # - values: ["--measure", "manhattan"] - # - values: ["--measure", "chebyshev"] - # repository: - # url: 
https://github.com/omnibenchmark-example/distance.git - # commit: dd99d4f - # inputs: - # - entries: - # - data.features - # outputs: - # - id: distances - # path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv" - - # ## daniel's methods ################################################################### - - # - id: danielmethods - # modules: - # - id: kmeans - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/kmeans.git - # commit: 049c8b1 - # - id: ward - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ward.git - # commit: 976e3f3 - # inputs: - # - entries: - # - distances - # outputs: - # - id: methods.clusters - # path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv" - - # ## daniel's metrics ################################################################### - - # - id: danielsmetrics - # modules: - # - id: ari - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ari.git - # commit: 72708f0 - # - id: accuracy - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/accuracy.git - # commit: e26b32f - # inputs: - # - entries: - # - methods.clusters - # - data.labels - # outputs: - # - id: metrics.mapping - # path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt" diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml index f37fd6c..52fb13e 100644 --- a/Clustering_envmodules.yml +++ b/Clustering_envmodules.yml @@ -1,28 +1,33 @@ id: clustering_example_envmodules + description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
-version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" -benchmark_yaml_spec: 0.5 +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 software_backend: envmodules software_environments: + clustbench: description: "clustbench on py3.12.6" + conda: envs/clustbench.yml # not used envmodule: clustbench/0.1.0-foss-2023b - conda: envs/clustbench.yml apptainer: na + + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmakrkdown.yml # not used + envmodule: rmarkdown + apptainer: na + fcps: description: "CRAN's FCPS" + conda: envs/fcps.yml # not used envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 - conda: envs/fcps.yml - apptainer: na - rmarkdown: - description: "R with some plotting dependencies" - envmodule: rmarkdown # TODO - conda: envs/clustbench.yml apptainer: na + metric_collectors: - id: plotting name: "Single-backend metric collector." diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index 61fbd13..784e443 100755 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -1,7 +1,6 @@ #!/bin/sh CMD=singularity BUILD=build --fakeroot -$CMD $BUILD sklearn_optimized.sif sklearn_singularity_optimized.def -$CMD $BUILD clustbench-optimized.sif clustbench_singularity.def -$CMD $BUILD r.sif r_singularity.def -$CMD $BUILD fcps.sif fcps_singularity.def +$CMD $BUILD clustbench-vanilla.sif clustbench_apptainer_vanillapy.def +$CMD $BUILD clustbench-optimized.sif clustbench_apptainer_optimized.def +$CMD $BUILD fcps.sif fcps_singularity_optimized.def diff --git a/envs/sklearn_singularity_optimized.def b/envs/clustbench_apptainer_optimized.def similarity index 71% rename from envs/sklearn_singularity_optimized.def rename to envs/clustbench_apptainer_optimized.def index 17a131d..d4a316d 100644 --- a/envs/sklearn_singularity_optimized.def +++ b/envs/clustbench_apptainer_optimized.def @@ -15,7 +15,6 @@ From: ubuntu:noble-20250404 echo "deb-src http://archive.ubuntu.com/ubuntu/ 
noble-updates main restricted universe multiverse" >> /etc/apt/sources.list apt-get update - # Get build dependencies for Python apt-get build-dep -y python3 @@ -43,17 +42,39 @@ From: ubuntu:noble-20250404 # Install required packages with pip - pip3 install \ + pip install -U pip + + pip install \ "clustering-benchmarks==1.1.6" \ - "fastcluster==1.2.6" \ - "numpy==1.26.4" \ - "scipy==1.14.1" \ - "isodate" \ - "pydantic-core" \ + "contourpy==1.3.2" \ + "cycler==0.12.1" \ + "cython==3.1.0" \ + "fonttools==4.58.0" \ "genieclust==1.1.6" \ + "joblib==1.5.0" \ + "kiwisolver==1.4.8" \ + "matplotlib==3.10.3" \ + "natsort==8.4.0" \ + "numpy==2.2.5" \ + "packaging==25.0" \ "pandas==2.2.3" \ + "pillow==11.2.1" \ + "pyparsing==3.2.3" \ + "python-dateutil==2.9.0.post0" \ + "pytz==2025.2" \ + "scikit-learn==1.6.1" \ + "scipy==1.15.3" \ + "six==1.17.0" \ + "threadpoolctl==3.6.0" \ + "tzdata==2025.2" \ + "fastcluster==1.2.6" \ "gitpython==3.1.43" \ - wget" + "isodate==0.7.2" \ + "pydantic-core==2.34.1" + + # TODO: can we use something more maintained? + pip install --pre "python3-wget==0.0.2-beta1" + echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def new file mode 100644 index 0000000..1f2b4e3 --- /dev/null +++ b/envs/clustbench_apptainer_vanillapy.def @@ -0,0 +1,55 @@ +Bootstrap: docker +From: ubuntu:noble-20250404 + +%labels + Author izaskun.mallona@gmail.com + Author ben.uzh@proton.me + +%post + # Create virtualenv using the default Python + mkdir -p /opt && cd /opt + python3.12 -m venv "default" + . 
default/bin/activate + + # Install required packages with pip + + pip install -U pip + + pip install \ + "clustering-benchmarks==1.1.6" \ + "contourpy==1.3.2" \ + "cycler==0.12.1" \ + "cython==3.1.0" \ + "fonttools==4.58.0" \ + "genieclust==1.1.6" \ + "joblib==1.5.0" \ + "kiwisolver==1.4.8" \ + "matplotlib==3.10.3" \ + "natsort==8.4.0" \ + "numpy==2.2.5" \ + "packaging==25.0" \ + "pandas==2.2.3" \ + "pillow==11.2.1" \ + "pyparsing==3.2.3" \ + "python-dateutil==2.9.0.post0" \ + "pytz==2025.2" \ + "scikit-learn==1.6.1" \ + "scipy==1.15.3" \ + "six==1.17.0" \ + "threadpoolctl==3.6.0" \ + "tzdata==2025.2" \ + "fastcluster==1.2.6" \ + "gitpython==3.1.43" \ + "isodate==0.7.2" \ + "pydantic-core==2.34.1" + + # TODO: can we use something more maintained? + pip install --pre "python3-wget==0.0.2-beta1" + + + echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT + +%environment + + . /opt/default/bin/activate + diff --git a/envs/clustbench_singularity.def b/envs/clustbench_singularity.def deleted file mode 100644 index 8c2ae85..0000000 --- a/envs/clustbench_singularity.def +++ /dev/null @@ -1,35 +0,0 @@ -Bootstrap: docker -From: ubuntu:jammy-20240911.1 - -%labels - - AUTHOR izaskun.mallona@gmail.com - -%post - - # Install python3.12 - apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git - - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 4 - make altinstall - - # virtualenv - cd /opt - python3.12 -m venv "default" - . default/bin/activate - - pip3 install "clustering-benchmarks==1.1.5" "wget" "fastcluster==1.2.6" "numpy==1.26.4" "scipy==1.14.1" \ - "isodate" "pydantic-core" \ - "genieclust==1.1.6" "pandas==2.2.3" "gitpython==3.1.43" - - echo '. 
/opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT - -%environment - - . /opt/default/bin/activate diff --git a/envs/fcps_singularity.def b/envs/fcps_singularity_optimized.def similarity index 79% rename from envs/fcps_singularity.def rename to envs/fcps_singularity_optimized.def index a4a615e..6362b9e 100644 --- a/envs/fcps_singularity.def +++ b/envs/fcps_singularity_optimized.def @@ -4,6 +4,7 @@ From: rocker/tidyverse:4.3.3 %labels AUTHOR izaskun.mallona@gmail.com + AUTHOR ben.uzh@proton.me %post @@ -13,11 +14,11 @@ From: rocker/tidyverse:4.3.3 libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git \ libgsl-dev - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz + wget https://www.python.org/ftp/python/3.12.9/Python-3.12.9.tgz + tar -xf Python-3.12.9.tgz cd Python-3.12.*/ ./configure --enable-optimizations - make -j 4 + make -j 8 make altinstall # virtualenv @@ -25,13 +26,15 @@ From: rocker/tidyverse:4.3.3 python3.12 -m venv "default" . default/bin/activate + # TODO: pin dependencies pip install gitpython==3.1.43 isodate pydantic-core ## no versioning here + ## TODO(ben): get same versions as in easyconfig Rscript -e 'BiocManager::install(c( "dbscan", "cluster", "protoclust", "energy", "argparse", "mclust", "DataVisualizations", "FCPS", "cclust"))' echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT %environment - + . 
/opt/default/bin/activate diff --git a/envs/sklearn.yml b/envs/sklearn.yml deleted file mode 100644 index 258b7ea..0000000 --- a/envs/sklearn.yml +++ /dev/null @@ -1,11 +0,0 @@ -name: sklearn -channels: - - conda-forge - - nodefaults -dependencies: - - conda-forge::python=3.12.6 - - conda-forge::scikit-learn - - conda-forge::pip - - pip: - - "pandas" - - "argparse" From e8e0f7eb2313696f7494e65c20d44def8301de63 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 13:29:28 +0200 Subject: [PATCH 22/60] escape --- envs/build_singularity.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index 784e443..2dae40a 100755 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -1,6 +1,6 @@ #!/bin/sh CMD=singularity -BUILD=build --fakeroot -$CMD $BUILD clustbench-vanilla.sif clustbench_apptainer_vanillapy.def -$CMD $BUILD clustbench-optimized.sif clustbench_apptainer_optimized.def -$CMD $BUILD fcps.sif fcps_singularity_optimized.def +BUILD='build --fakeroot' +$CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def +$CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def +$CMD ${BUILD} fcps.sif fcps_singularity_optimized.def From a8336fba907ae43ce16678de97a22538addb06e6 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 13:42:08 +0200 Subject: [PATCH 23/60] install updated python --- envs/clustbench_apptainer_vanillapy.def | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def index 1f2b4e3..5d388bf 100644 --- a/envs/clustbench_apptainer_vanillapy.def +++ b/envs/clustbench_apptainer_vanillapy.def @@ -6,9 +6,19 @@ From: ubuntu:noble-20250404 Author ben.uzh@proton.me %post + export DEBIAN_FRONTEND=noninteractive + apt-get update && \ + apt-get install -y \ + python3 \ + python3-venv \ + python3-pip \ + ca-certificates \ + && apt-get clean 
&& \ + rm -rf /var/lib/apt/lists/* + # Create virtualenv using the default Python mkdir -p /opt && cd /opt - python3.12 -m venv "default" + /usr/bin/python3 -m venv "default" . default/bin/activate # Install required packages with pip @@ -46,6 +56,8 @@ From: ubuntu:noble-20250404 # TODO: can we use something more maintained? pip install --pre "python3-wget==0.0.2-beta1" + # Do some cleanup to keep the image slim + rm -rf ~/.cache echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT From 518c2f6c894b097855a32f7e810783b13e9ec386 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 13:46:47 +0200 Subject: [PATCH 24/60] sync the two build recipes --- envs/clustbench_apptainer_optimized.def | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def index d4a316d..1e934a8 100644 --- a/envs/clustbench_apptainer_optimized.def +++ b/envs/clustbench_apptainer_optimized.def @@ -10,6 +10,7 @@ From: ubuntu:noble-20250404 PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. -f1,2) # Update and enable deb-src + export DEBIAN_FRONTEND=noninteractive apt-get update echo "deb-src http://archive.ubuntu.com/ubuntu/ noble main restricted universe multiverse" >> /etc/apt/sources.list echo "deb-src http://archive.ubuntu.com/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list @@ -19,19 +20,33 @@ From: ubuntu:noble-20250404 apt-get build-dep -y python3 # Extra dependencies - apt-get install -y git python-is-python3 wget zlib1g-dev libbz2-dev libssl-dev libffi-dev + apt-get install -y git \ + python-is-python3 \ + wget \ + zlib1g-dev \ + libbz2-dev \ + libssl-dev \ + libffi-dev \ + && apt-get clean && \ + rm -rf /var/lib/apt/lists/* # Calculate half the number of available cores HALF_NPROC=$(( $(nproc) / 2 )) # Ensure at least one core is used CORES_TO_USE=$(( HALF_NPROC > 0 ? 
HALF_NPROC : 1 )) - # Download and build Python with optimizations + # Download and build Python from source, with optimizations + wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz tar -xf Python-${PYTHON_VERSION}.tgz cd Python-${PYTHON_VERSION}*/ + # Enable all possible optimizations - ./configure --enable-optimizations --with-lto --enable-shared LDFLAGS="-Wl,-rpath /usr/local/lib" + ./configure \ + --enable-optimizations \ + --with-lto \ + --enable-shared \ + LDFLAGS="-Wl,-rpath /usr/local/lib" make -j ${CORES_TO_USE} make altinstall @@ -75,6 +90,8 @@ From: ubuntu:noble-20250404 # TODO: can we use something more maintained? pip install --pre "python3-wget==0.0.2-beta1" + # Do some cleanup to keep the image slim + rm -rf ~/.cache echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT From 2f4131f08f967bf0b934a25907e21bb9d54c001c Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 14:09:01 +0200 Subject: [PATCH 25/60] delete source folder --- envs/clustbench_apptainer_optimized.def | 2 ++ 1 file changed, 2 insertions(+) diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def index 1e934a8..eda9ea6 100644 --- a/envs/clustbench_apptainer_optimized.def +++ b/envs/clustbench_apptainer_optimized.def @@ -37,6 +37,7 @@ From: ubuntu:noble-20250404 # Download and build Python from source, with optimizations + mkdir ~/src && cd src wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz tar -xf Python-${PYTHON_VERSION}.tgz cd Python-${PYTHON_VERSION}*/ @@ -92,6 +93,7 @@ From: ubuntu:noble-20250404 # Do some cleanup to keep the image slim rm -rf ~/.cache + rm -rf ~/src echo '. 
/opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT From c72eb273f395e9c5805ee0806cc247a77b783443 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 14:15:59 +0200 Subject: [PATCH 26/60] add microbenchmark for numpy operations --- microbenchmark/microbench.py | 67 ++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 microbenchmark/microbench.py diff --git a/microbenchmark/microbench.py b/microbenchmark/microbench.py new file mode 100644 index 0000000..3730f9e --- /dev/null +++ b/microbenchmark/microbench.py @@ -0,0 +1,67 @@ +""" +This script exercises a few common linear algebra operations in numpy. +It's intended mostly to gauge whether it makes sense to descend into +compiler optimizations for the Python binary that we ship within the SIF images, +but it can be easily repurposed for other specific microbenchmarks (i.e., numba or GPU perf gains). + +Be aware that here we're profiling simple operations; it would make sense to carefully +profile the libraries of interest to see where the computational bottlenecks really are. 
+ +Usage: + +singularity exec clustbench-vanilla.sif python3 microbench.py +singularity exec clustbench-optimized.sif python3 microbench.py +""" +import numpy as np +import time +import json +from statistics import mean, stdev + +def run_operation(operation, func, repetitions): + timings = [] + for _ in range(repetitions): + start = time.perf_counter() + func() + elapsed = time.perf_counter() - start + timings.append(elapsed) + return { + 'operation': operation, + 'mean': mean(timings), + 'stdev': stdev(timings), + 'runs': repetitions + } + +def benchmark(repetitions=50): + np.random.seed(42) + size = 1000 + + # Create random matrices + A = np.random.rand(size, size) + B = np.random.rand(size, size) + C = A @ A.T # Ensure positive definite for Cholesky + + # Define operations + operations = [ + ('mat_mul', lambda: np.dot(A, B)), + ('svd', lambda: np.linalg.svd(A)), + ('chol_decomp', lambda: np.linalg.cholesky(C)) + ] + + results = [] + for operation, func in operations: + try: + result = run_operation(operation, func, repetitions) + except np.linalg.LinAlgError: + result = { + 'operation': operation, + 'error': 'Operation failed due to numerical instability' + } + results.append(result) + + # Output results as JSON + print(json.dumps(results, indent=2)) + +if __name__ == "__main__": + import sys + repetitions = int(sys.argv[1]) if len(sys.argv) > 1 else 10 + benchmark(repetitions) From 937e45599633e3a58af28beece19f038a4fd9513 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 14:16:41 +0200 Subject: [PATCH 27/60] fix path --- envs/clustbench_apptainer_optimized.def | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def index eda9ea6..19726c2 100644 --- a/envs/clustbench_apptainer_optimized.def +++ b/envs/clustbench_apptainer_optimized.def @@ -37,7 +37,7 @@ From: ubuntu:noble-20250404 # Download and build Python from source, with optimizations - mkdir ~/src && cd src 
+ mkdir ~/src && cd ~/src wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz tar -xf Python-${PYTHON_VERSION}.tgz cd Python-${PYTHON_VERSION}*/ From b0bd85adfed66583b676a3f378f0577701efa5a5 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 14:24:30 +0200 Subject: [PATCH 28/60] default reps --- microbenchmark/microbench.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/microbenchmark/microbench.py b/microbenchmark/microbench.py index 3730f9e..314e66b 100644 --- a/microbenchmark/microbench.py +++ b/microbenchmark/microbench.py @@ -17,6 +17,8 @@ import json from statistics import mean, stdev +DEFAULT_REPETITIONS = 10 + def run_operation(operation, func, repetitions): timings = [] for _ in range(repetitions): @@ -31,7 +33,7 @@ def run_operation(operation, func, repetitions): 'runs': repetitions } -def benchmark(repetitions=50): +def benchmark(repetitions=DEFAULT_REPETITIONS): np.random.seed(42) size = 1000 @@ -63,5 +65,5 @@ def benchmark(repetitions=50): if __name__ == "__main__": import sys - repetitions = int(sys.argv[1]) if len(sys.argv) > 1 else 10 + repetitions = int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_REPETITIONS benchmark(repetitions) From 83f9b07dfcbf25dbfce67d17d92eedbe469e2bd9 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 14:40:30 +0200 Subject: [PATCH 29/60] refs --- microbenchmark/microbench.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/microbenchmark/microbench.py b/microbenchmark/microbench.py index 314e66b..6abc6ee 100644 --- a/microbenchmark/microbench.py +++ b/microbenchmark/microbench.py @@ -11,6 +11,8 @@ singularity exec clustbench-vanilla.sif python3 microbench.py singularity exec clustbench-optimized.sif python3 microbench.py + +References: https://pythonspeed.com/articles/faster-python/ """ import numpy as np import time From 744c978643ad5623de1c4b176ab887e7a6127739 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 15:11:30 +0200 Subject: 
[PATCH 30/60] duplicate the apptainer clustering yaml --- Clustering_apptainer_optimized.yml | 39 +++-- Clustering_apptainer_vanilla.yml | 223 +++++++++++++++++++++++++++++ 2 files changed, 241 insertions(+), 21 deletions(-) create mode 100644 Clustering_apptainer_vanilla.yml diff --git a/Clustering_apptainer_optimized.yml b/Clustering_apptainer_optimized.yml index 96e357e..a073683 100644 --- a/Clustering_apptainer_optimized.yml +++ b/Clustering_apptainer_optimized.yml @@ -1,6 +1,6 @@ -id: clustering_example_apptainer - +id: clustering_example_apptainer_optimized description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. + version: 1.5 benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" benchmark_yaml_spec: 0.4 @@ -10,27 +10,28 @@ software_backend: apptainer software_environments: clustbench: - description: "clustbench on py3.12.6" + description: "clustbench on py3.12.9, optimized python build" conda: envs/clustbench.yml # not used envmodule: na apptainer: envs/clustbench-optimized.sif + fcps: + description: "CRAN's FCPS" + conda: envs/fcps.yml # not used + envmodule: na + apptainer: envs/fcps.sif + rmarkdown: description: "R with some plotting dependencies" conda: envs/rmarkdown.yml # not used envmodule: na apptainer: envs/rmarkdown.sif - fcps: - description: "CRAN's FCPS" - conda: envs/fcps.yml - envmodule: na - apptainer: envs/fcps.sif metric_collectors: - id: plotting name: "Single-backend metric collector." 
- software_environment: "rmarkdown" + software_environment: rmarkdown repository: url: https://github.com/imallona/clustering_report commit: 1d6bdf5 @@ -43,11 +44,10 @@ metric_collectors: stages: - id: data - ## clustbench data modules: - id: clustbench name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data commit: 366c5a2 @@ -120,16 +120,13 @@ stages: - id: data.true_labels path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - ## clustbench methods (fastcluster) ################################################################### - - id: clustering modules: - id: fastcluster name: "fastcluster algorithm" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_fastcluster - # url: /home/imallona/src/clustbench_fastcluster/ commit: "45e43d3" parameters: - values: ["--linkage", "complete"] @@ -138,12 +135,12 @@ stages: - values: ["--linkage", "weighted"] - values: ["--linkage", "median"] - values: ["--linkage", "centroid"] + - id: sklearn - name: "sklearn" - software_environment: "clustbench" + name: sklearn + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_sklearn - #url: /home/imallona/src/clustbench_sklearn commit: 5877378 parameters: - values: ["--method", "birch"] @@ -161,8 +158,8 @@ stages: - values: ["--linkage", "complete"] - values: ["--linkage", "ward"] - id: genieclust - name: "genieclust" - software_environment: "clustbench" + name: genieclust + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_genieclust commit: 6090043 @@ -201,7 +198,7 @@ stages: modules: - id: partition_metrics name: "clustbench partition metrics" - software_environment: "clustbench" + software_environment: clustbench repository: url: 
https://github.com/imallona/clustbench_metrics commit: 9132d45 diff --git a/Clustering_apptainer_vanilla.yml b/Clustering_apptainer_vanilla.yml new file mode 100644 index 0000000..46b8ea4 --- /dev/null +++ b/Clustering_apptainer_vanilla.yml @@ -0,0 +1,223 @@ +id: clustering_example_apptainer_vanilla + +description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 + +software_backend: apptainer + +software_environments: + + clustbench: + description: "clustbench on py3.12.6" + conda: envs/clustbench.yml # not used + envmodule: na + apptainer: envs/clustbench-vanilla.sif + + fcps: + description: "CRAN's FCPS" + conda: envs/fcps.yml # not used + envmodule: na + apptainer: envs/fcps.sif + + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml # not used + envmodule: na + apptainer: envs/rmarkdown.sif + + +metric_collectors: + - id: plotting + name: "Single-backend metric collector." 
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" + +stages: + + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 366c5a2 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 + - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 + - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 + - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 + - values: 
["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 + - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 + - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 + - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 + - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 + - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 + - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 + - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 + 
- values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 + - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: 
["--linkage", "ward"] + - values: ["--linkage", "average"] + - values: ["--linkage", "weighted"] + - values: ["--linkage", "median"] + - values: ["--linkage", "centroid"] + + - id: sklearn + name: sklearn + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + # - values: ["--method", "spectral"] ## too slow + - values: ["--method", "gm"] + - id: agglomerative + name: "agglomerative" + software_environment: "clustbench" + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "average"] + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: genieclust + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - values: ["--method", "gic"] + - values: ["--method", "ica"] + - id: fcps + name: "fcps" + software_environment: "fcps" + repository: + url: https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in conda + - values: ["--method", "FCPS_Minimax"] + - values: ["--method", "FCPS_MinEnergy"] + - values: ["--method", "FCPS_HDBSCAN_2"] + - values: ["--method", "FCPS_HDBSCAN_4"] + - values: ["--method", "FCPS_HDBSCAN_8"] + - values: ["--method", "FCPS_Diana"] + - values: ["--method", "FCPS_Fanny"] + - values: ["--method", "FCPS_Hardcl"] + - values: ["--method", "FCPS_Softcl"] + - values: ["--method", "FCPS_Clara"] + - values: ["--method", "FCPS_PAM"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + + - id: metrics + modules: + - id: partition_metrics + 
name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 9132d45 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + - values: ["--metric", "adjusted_mi_score"] + - values: ["--metric", "adjusted_rand_score"] + - values: ["--metric", "fm_score"] + - values: ["--metric", "mi_score"] + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "normalized_mi_score"] + - values: ["--metric", "normalized_pivoted_accuracy"] + - values: ["--metric", "pair_sets_index"] + - values: ["--metric", "rand_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" From ec18dcf21ce6d23a1c20f88a5431d9a2c040abae Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 15:15:36 +0200 Subject: [PATCH 31/60] update the oras yaml. not working, just to keep in sync --- Clustering_oras.yml | 128 ++++++++------------------------------------ 1 file changed, 22 insertions(+), 106 deletions(-) diff --git a/Clustering_oras.yml b/Clustering_oras.yml index 6640461..c6f0d7e 100644 --- a/Clustering_oras.yml +++ b/Clustering_oras.yml @@ -1,36 +1,37 @@ -id: clustering_example +id: clustering_example_oras description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. Caution dirty apptainer sifs. 
-version: 1.2 +version: 1.5 + benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: https://play.min.io -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clustering_example +benchmark_yaml_spec: 0.4 + +#storage: https://play.min.io +#storage_api: S3 +#storage_bucket_name: clustering_example + software_backend: apptainer + software_environments: + clustbench: description: "clustbench on py3.12.6" - conda: envs/clustbench.yml + conda: envs/clustbench.yml # not used envmodule: clustbench apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/clustbench:latest - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/r:latest - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/sklearn:latest - envmodule: fcps # not true, but + fcps: description: "CRAN's FCPS" - conda: envs/fcps.yml + conda: envs/fcps.yml # not used + envmodule: na apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/fcps:latest - envmodule: fcps -stages: - ## clustbench data ########################################################## + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml # not used + envmodule: na + apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/rmarkdown:latest + +stages: - id: data modules: @@ -214,88 +215,3 @@ stages: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - # ## daniel's data ########################################################################### - - # - id: danielsdata - # modules: - # - id: iris_manual - # name: "Iris Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/iris.git - # commit: 47c63f0 - # - id: penguins 
- # name: "Penguins Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/penguins.git - # commit: 9032478 - # outputs: - # - id: data.features - # path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv" - # - id: data.labels - # path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv" - - # ## daniel's distances ######################################################################## - - # - id: distances - # modules: - # - id: D1 - # software_environment: "sklearn" - # parameters: - # - values: ["--measure", "cosine"] - # - values: ["--measure", "euclidean"] - # - values: ["--measure", "manhattan"] - # - values: ["--measure", "chebyshev"] - # repository: - # url: https://github.com/omnibenchmark-example/distance.git - # commit: dd99d4f - # inputs: - # - entries: - # - data.features - # outputs: - # - id: distances - # path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv" - - # ## daniel's methods ################################################################### - - # - id: danielmethods - # modules: - # - id: kmeans - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/kmeans.git - # commit: 049c8b1 - # - id: ward - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ward.git - # commit: 976e3f3 - # inputs: - # - entries: - # - distances - # outputs: - # - id: methods.clusters - # path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv" - - # ## daniel's metrics ################################################################### - - # - id: danielsmetrics - # modules: - # - id: ari - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ari.git - # commit: 72708f0 - # - id: accuracy - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/accuracy.git - # commit: e26b32f - # inputs: - # - entries: - # - 
methods.clusters - # - data.labels - # outputs: - # - id: metrics.mapping - # path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt" From cf52a2c7b3595e25488b1f0a007e3d30045fb74b Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 15:21:02 +0200 Subject: [PATCH 32/60] update the rmarkdown environment --- Clustering_envmodules.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml index 52fb13e..a2112d4 100644 --- a/Clustering_envmodules.yml +++ b/Clustering_envmodules.yml @@ -18,7 +18,7 @@ software_environments: rmarkdown: description: "R with some plotting dependencies" conda: envs/rmakrkdown.yml # not used - envmodule: rmarkdown + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 apptainer: na fcps: @@ -44,7 +44,6 @@ metric_collectors: stages: - id: data - ## clustbench data modules: - id: clustbench name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" From 934ce8baa625f2877a79958f6091fbd4eae4b96f Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 15:34:49 +0200 Subject: [PATCH 33/60] update makefile --- Clustering_conda.yml | 12 +-- Clustering_conda_smoketest.yml | 129 +++++++++++++++++++++++++++ Clustering_envmodules_smoketest.yml | 131 ++++++++++++++++++++++++++++ Makefile | 23 ++++- envs/rmarkdown.yml | 4 +- 5 files changed, 289 insertions(+), 10 deletions(-) create mode 100644 Clustering_conda_smoketest.yml create mode 100644 Clustering_envmodules_smoketest.yml diff --git a/Clustering_conda.yml b/Clustering_conda.yml index 61352e1..7822761 100644 --- a/Clustering_conda.yml +++ b/Clustering_conda.yml @@ -15,18 +15,18 @@ software_environments: envmodule: clustbench apptainer: na - rmarkdown: - description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml - envmodule: fcps # not used - apptainer: na - fcps: description: "CRAN's FCPS" conda: envs/fcps.yml envmodule: fcps apptainer: na + rmarkdown: + 
description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: fcps # not used + apptainer: na + metric_collectors: - id: plotting name: "Single-backend metric collector." diff --git a/Clustering_conda_smoketest.yml b/Clustering_conda_smoketest.yml new file mode 100644 index 0000000..15215d7 --- /dev/null +++ b/Clustering_conda_smoketest.yml @@ -0,0 +1,129 @@ +id: clustering_example_envmodules +description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. +version: 1.4 +benchmarker: "Izaskun Mallona, Daniel Incicau" +benchmark_yaml_spec: 0.5 + +software_backend: conda + +software_environments: + + clustbench: + description: "clustbench on py3.12.6" + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml + apptainer: na + + fcps: + description: "CRAN's FCPS" + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml + apptainer: na + + rmarkdown: + description: "R with some plotting dependencies" + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + conda: envs/rmarkdown.yml + apptainer: na + +metric_collectors: + - id: plotting + name: "Single-backend metric collector." 
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" + +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 366c5a2 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url: 
https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" diff --git a/Clustering_envmodules_smoketest.yml b/Clustering_envmodules_smoketest.yml new file mode 100644 index 0000000..3fa8e81 --- /dev/null +++ b/Clustering_envmodules_smoketest.yml @@ -0,0 +1,131 @@ +id: clustering_example_envmodules +description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. +version: 1.4 +benchmarker: "Izaskun Mallona, Daniel Incicau" +benchmark_yaml_spec: 0.5 + +software_backend: envmodules + +software_environments: + + clustbench: + description: "clustbench on py3.12.6" + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml + apptainer: na + + fcps: + description: "CRAN's FCPS" + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml + apptainer: na + + rmarkdown: + description: "R with some plotting dependencies" + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + conda: envs/clustbench.yml + apptainer: na + +metric_collectors: + - id: plotting + name: "Single-backend metric collector." 
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" + +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 366c5a2 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + + ## clustbench methods (fastcluster) ################################################################### + + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", 
"--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url: https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" diff --git a/Makefile b/Makefile index 875a375..e8e942d 100644 --- a/Makefile +++ b/Makefile @@ -6,9 +6,26 @@ prepare_apptainer_env: prepare_envmodules_env: cd envs && eb clustbench.eb --robot cd envs && eb fcps.eb --robot -run_with_apptainer_backend: - ${OB_CMD} -b Clustering_singularity.yml - mv out out_apptainer + cd envs && eb rmarkdown.eb --robot + +# short versions, to debug runs & environments +run_with_apptainer_backend_short: + ${OB_CMD} -b Clustering_apptainer_vanilla_smoketest.yml + mv out out_apptainer_short +run_with_conda_backend_short: + ${OB_CMD} -b Clustering_conda_smoketest.yml + mv out out_conda +run_with_envmodules_backend_short: + ${OB_CMD} -b Clustering_envmodules_smoketest.yml + mv out out_lmod_short + +# full versions (expect hours) +run_with_apptainer_backend_vanilla: + ${OB_CMD} -b Clustering_apptainer_vanilla.yml + mv out out_apptainer_vanilla +run_with_apptainer_backend_optimized: + ${OB_CMD} -b Clustering_apptainer_optimized.yml + mv out out_apptainer_vanilla run_with_conda_backend: ${OB_CMD} -b Clustering_conda.yml mv out out_conda diff 
--git a/envs/rmarkdown.yml b/envs/rmarkdown.yml index e57969e..ed5c65e 100644 --- a/envs/rmarkdown.yml +++ b/envs/rmarkdown.yml @@ -7,6 +7,8 @@ dependencies: - conda-forge::python=3.12.6 - conda-forge::r-argparse - conda-forge::r-rmarkdown + - conda-forge::r-cairo + - conda-forge::r-svglite - conda-forge::r-ggplot2 - - conda-forge::r-tidyr + - conda-forge::r-tidyr - bioconda::bioconductor-complexheatmap From 3890cb48664570aa7a9878dacb299146d69ced5d Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 16:15:38 +0200 Subject: [PATCH 34/60] add apptainer definition for rmarkdown --- envs/build_singularity.sh | 4 +++- envs/rmarkdown.def | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 envs/rmarkdown.def diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index 2dae40a..c34208b 100755 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -2,5 +2,7 @@ CMD=singularity BUILD='build --fakeroot' $CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def -$CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def +# enable this if you want to compare with the custom python compilation +# $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def $CMD ${BUILD} fcps.sif fcps_singularity_optimized.def +$CMD ${BUILD} rmarkdown.sif rmarkdown.def diff --git a/envs/rmarkdown.def b/envs/rmarkdown.def new file mode 100644 index 0000000..ce7ca1e --- /dev/null +++ b/envs/rmarkdown.def @@ -0,0 +1,38 @@ +Bootstrap: docker +From: rocker/tidyverse:4.4 + +%labels + + AUTHOR izaskun.mallona@gmail.com + AUTHOR ben.uzh@proton.me + +%post + + # Install python (3.12 as of noble) + export DEBIAN_FRONTEND=noninteractive + apt-get update + apt-get install -y git \ + python-is-python3 \ + python3.12 \ + python3-virtualenv \ + && apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + # virtualenv + cd /opt + python3.12 -m venv "default" + . 
default/bin/activate + + pip install \ + "gitpython==3.1.43" \ + "isodate==0.7.2" \ + "pydantic-core==2.34.1" + + # Install R packages + Rscript -e 'BiocManager::install(c("mclust", "caret", "readr", "argparse", "rmarkdown"))' + + echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT + +%environment + + . /opt/default/bin/activate From c80adc10844d9251572f00352795be17c01a61a3 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 16:29:08 +0200 Subject: [PATCH 35/60] remove unneeded dependencies --- envs/rmarkdown.def | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/envs/rmarkdown.def b/envs/rmarkdown.def index ce7ca1e..8dc75b6 100644 --- a/envs/rmarkdown.def +++ b/envs/rmarkdown.def @@ -14,7 +14,7 @@ From: rocker/tidyverse:4.4 apt-get install -y git \ python-is-python3 \ python3.12 \ - python3-virtualenv \ + python3.12-venv \ && apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -29,7 +29,7 @@ From: rocker/tidyverse:4.4 "pydantic-core==2.34.1" # Install R packages - Rscript -e 'BiocManager::install(c("mclust", "caret", "readr", "argparse", "rmarkdown"))' + Rscript -e 'BiocManager::install(c("mclust", "caret", "argparse"))' echo '. 
/opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT From b19a489cec78d49c57b1c2b9e6cf1c3b0604c1ca Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 16:31:01 +0200 Subject: [PATCH 36/60] update makefile --- Makefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index e8e942d..f342949 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,9 @@ MAX_CORES ?= 10 +TIMEOUT ?= 4h + # by default, we want to run all snakemake rules even if there are failures (-k) -OB_CMD=ob run benchmark -k --local --task-timeout "4h" --cores ${MAX_CORES} +OB_CMD=ob run benchmark -k --local --task-timeout ${TIMEOUT} --cores ${MAX_CORES} + prepare_apptainer_env: cd envs && ./build_singularity.sh prepare_envmodules_env: @@ -14,7 +17,7 @@ run_with_apptainer_backend_short: mv out out_apptainer_short run_with_conda_backend_short: ${OB_CMD} -b Clustering_conda_smoketest.yml - mv out out_conda + mv out out_conda_short run_with_envmodules_backend_short: ${OB_CMD} -b Clustering_envmodules_smoketest.yml mv out out_lmod_short From ebd69b79937e55a68e968d829910c5c6f3d80b70 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 16:38:13 +0200 Subject: [PATCH 37/60] cleanup r/fcps deps --- Clustering_conda.yml | 2 +- envs/build_singularity.sh | 2 +- ...cps_singularity_optimized.def => fcps.def} | 29 ++++++++------- envs/fcps.eb | 3 +- envs/r.yml | 12 ------ envs/r_singularity.def | 37 ------------------- 6 files changed, 19 insertions(+), 66 deletions(-) rename envs/{fcps_singularity_optimized.def => fcps.def} (59%) delete mode 100644 envs/r.yml delete mode 100644 envs/r_singularity.def diff --git a/Clustering_conda.yml b/Clustering_conda.yml index 7822761..9e74ee5 100644 --- a/Clustering_conda.yml +++ b/Clustering_conda.yml @@ -30,7 +30,7 @@ software_environments: metric_collectors: - id: plotting name: "Single-backend metric collector." 
- software_environment: "rmarkdown" + software_environment: rmarkdown repository: url: https://github.com/imallona/clustering_report commit: 1d6bdf5 diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index c34208b..f8596a7 100755 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -4,5 +4,5 @@ BUILD='build --fakeroot' $CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def # enable this if you want to compare with the custom python compilation # $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def -$CMD ${BUILD} fcps.sif fcps_singularity_optimized.def +$CMD ${BUILD} fcps.sif fcps.def $CMD ${BUILD} rmarkdown.sif rmarkdown.def diff --git a/envs/fcps_singularity_optimized.def b/envs/fcps.def similarity index 59% rename from envs/fcps_singularity_optimized.def rename to envs/fcps.def index 6362b9e..f4eefcb 100644 --- a/envs/fcps_singularity_optimized.def +++ b/envs/fcps.def @@ -1,5 +1,5 @@ Bootstrap: docker -From: rocker/tidyverse:4.3.3 +From: rocker/tidyverse:4.4 %labels @@ -8,29 +8,32 @@ From: rocker/tidyverse:4.3.3 %post - # Install python3.12 + # Install python (3.12 as of noble) + export DEBIAN_FRONTEND=noninteractive + apt-get update + apt-get install -y git \ + python-is-python3 \ + python3.12 \ + python3.12-venv \ + && apt-get clean && \ + rm -rf /var/lib/apt/lists/* apt-get update apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git \ libgsl-dev - wget https://www.python.org/ftp/python/3.12.9/Python-3.12.9.tgz - tar -xf Python-3.12.9.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 8 - make altinstall - # virtualenv cd /opt python3.12 -m venv "default" . 
default/bin/activate - # TODO: pin dependencies - pip install gitpython==3.1.43 isodate pydantic-core + pip install \ + "gitpython==3.1.43" \ + "isodate==0.7.2" \ + "pydantic-core==2.34.1" - ## no versioning here - ## TODO(ben): get same versions as in easyconfig + # Install R packages + ## FIXME no versioning here Rscript -e 'BiocManager::install(c( "dbscan", "cluster", "protoclust", "energy", "argparse", "mclust", "DataVisualizations", "FCPS", "cclust"))' echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT diff --git a/envs/fcps.eb b/envs/fcps.eb index 54c8c7d..4d86bdd 100644 --- a/envs/fcps.eb +++ b/envs/fcps.eb @@ -15,12 +15,11 @@ dependencies = [ ('R', '4.3.2'), ('Boost', '1.82.0'), ('GSL', '2.7'), -# ('arrow-R', '14.0.1', versionsuffix), # required by RcisTarget ] exts_default_options = { 'source_urls': [ - 'https://bioconductor.org/packages/release/bioc/src/contrib/', + 'https://bioconductor.org/packages/release/bioc/src/contrib/', 'https://cran.r-project.org/src/contrib/Archive/%(name)s', # package archive 'https://cran.r-project.org/src/contrib/', # current version of packages 'https://cran.freestatistics.org/src/contrib', # mirror alternative for current packages diff --git a/envs/r.yml b/envs/r.yml deleted file mode 100644 index 456e139..0000000 --- a/envs/r.yml +++ /dev/null @@ -1,12 +0,0 @@ -name: r_for_metrics -channels: - - conda-forge - - nodefaults -dependencies: - - conda-forge::python=3.12.6 - - conda-forge::r-mclust - - conda-forge::r-caret - - conda-forge::r-dplyr - - conda-forge::r-readr - - conda-forge::r-argparse - diff --git a/envs/r_singularity.def b/envs/r_singularity.def deleted file mode 100644 index f1f9ec9..0000000 --- a/envs/r_singularity.def +++ /dev/null @@ -1,37 +0,0 @@ -Bootstrap: docker -From: rocker/tidyverse:4.4 - -%labels - - AUTHOR izaskun.mallona@gmail.com - -%post - - # Install python3.12 - apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - 
libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git - - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 4 - make altinstall - - # virtualenv - cd /opt - python3.12 -m venv "default" - . default/bin/activate - - pip install gitpython==3.1.43 isodate pydantic-core - - # Install R packages - - Rscript -e 'BiocManager::install(c("mclust", "caret", "readr", "argparse"))' - - echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT - -%environment - - . /opt/default/bin/activate From 1afaa2f2830f11563973a1ef9720753b0a47ceec Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 16:44:36 +0200 Subject: [PATCH 38/60] cleanup image --- envs/build_singularity.sh | 2 +- envs/fcps.def | 4 ---- envs/rmarkdown.def | 2 ++ 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index f8596a7..83203c8 100755 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -5,4 +5,4 @@ $CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def # enable this if you want to compare with the custom python compilation # $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def $CMD ${BUILD} fcps.sif fcps.def -$CMD ${BUILD} rmarkdown.sif rmarkdown.def +$CMD ${BUILD} rmarkdown.sif rmarkdown.def # this one is very similar to fcps, remove diff --git a/envs/fcps.def b/envs/fcps.def index f4eefcb..922d7f8 100644 --- a/envs/fcps.def +++ b/envs/fcps.def @@ -17,10 +17,6 @@ From: rocker/tidyverse:4.4 python3.12-venv \ && apt-get clean && \ rm -rf /var/lib/apt/lists/* - apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git \ - libgsl-dev # virtualenv cd /opt diff --git a/envs/rmarkdown.def b/envs/rmarkdown.def index 
8dc75b6..aa20cc1 100644 --- a/envs/rmarkdown.def +++ b/envs/rmarkdown.def @@ -1,6 +1,8 @@ Bootstrap: docker From: rocker/tidyverse:4.4 +# TODO: we could merge this one with fcps.def, no need to duplicate the image. + %labels AUTHOR izaskun.mallona@gmail.com From 9e2168a754e7a93e11867f68f4548f1415301c79 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 16:54:57 +0200 Subject: [PATCH 39/60] update readme --- envs/README.md | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/envs/README.md b/envs/README.md index 69aa5c1..3cab925 100644 --- a/envs/README.md +++ b/envs/README.md @@ -1,10 +1,9 @@ We distribute `Clustering.yml` runs with different backends. -- `Clustering_conda.yml`. Conda semi-reproducible (no pinning, pip) -- `Clustering_singularity.yml`. Singularity semi-reproducible, local SIF files. -- `Clustering_oras.yml`. Singularity semi-reproducible, prebuilt remote images. -- `Clustering_envmodules.yml`. Easybuilt with default optimization. - +- `Clustering_conda.yml`. Conda semi-reproducible (no pinning, using pip) +- `Clustering_apptainer.yml`. Singularity semi-reproducible, local SIF files. +- `Clustering_oras.yml`. Singularity semi-reproducible, prebuilt remote images from an ORAS registry. +- `Clustering_envmodules.yml`. Easybuild backend with default optimization. ## Conda @@ -12,8 +11,7 @@ We distribute `Clustering.yml` runs with different backends. - `clustbench.yml` - `fcps.yml` -- `r.yml` -- `sklearn.yml` +- `rmarkdown.yml` ### How to build @@ -23,24 +21,25 @@ No need to `ob software conda pin / prepare`; let `ob run benchmark -b Clusterin ### Files -- `clustbench_singularity.def` -- `fcps_singularity.def` -- `r_singularity.def` -- `sklearn_singularity.def` +The apptainer images are based in ubuntu-noble docker images. + +The "optimized" one does a custom python 3.12 compilation; the vanillapy stocks the default py3.12 interpreter from the official ubuntu docker image. 
+ +- `clustbench_apptainer_optimized.def` +- `clustbench_apptainer_vanillapy.def` +- `fcps.def` +- `rmarkdown.def` ### How to build -- `build_singularity.sh` +- `make prepare_apptainer_env` from the root folder. ## Aptainer semi-reproducible and remote -No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://gitlab.renkulab.io/izaskun.mallona/clustering_example/container_registry. +TODO: push to the registry (how?) -## Apptainer (reproducible) with easybuild - -Doing... +No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://gitlab.renkulab.io/izaskun.mallona/clustering_example/container_registry. -Lorem ipsum. ## envmodules - reproducible builds with easybuild @@ -48,11 +47,11 @@ Lorem ipsum. - `clustbench.eb` - `fcps.eb` +- `rmarkdown.eb` +- `rmarkdown-python.eb` ### How to build -1. Mind https://github.com/easybuilders/easybuild-easyconfigs/commit/e29210626f076e3a207f1abf3759ea124e28f8b2 -2. Mind `clustbench` is only installable from https://github.com/gagolews/genieclust/archive/refs/tags/v1.1.6.tar.gz and not from pypi's tgz (!), download it locally and ideally update the easyconfig to automate this -3. `python3-wget` from pypi doesn't look very well maintaned -4. `eb fcps.eb --robot` -5. `eb clustbench.eb --robot` +- `make prepare_envmodules_env` from the root folder. 
+- `python3-wget` from pypi doesn't look very well maintaned + From 6199c0a11bbc88a944d07e4b79bf329fc9c55990 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 17:00:25 +0200 Subject: [PATCH 40/60] fixes --- envs/clustbench.eb | 5 ----- envs/fcps.eb | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/envs/clustbench.eb b/envs/clustbench.eb index 0e86911..daae6dd 100644 --- a/envs/clustbench.eb +++ b/envs/clustbench.eb @@ -13,9 +13,6 @@ dependencies = [ ('SciPy-bundle', '2023.11'), ('matplotlib', '3.8.2'), ('scikit-learn', '1.4.0'), -# FIXME: I think this is not needed -- ben -# ('meson-python', '0.15.0'), -# ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b ] exts_list = [ @@ -48,5 +45,3 @@ exts_list = [ ] moduleclass = 'bio' - - diff --git a/envs/fcps.eb b/envs/fcps.eb index 4d86bdd..692bf0b 100644 --- a/envs/fcps.eb +++ b/envs/fcps.eb @@ -13,7 +13,7 @@ builddependencies = [('pkgconf', '1.9.5')] dependencies = [ ('R', '4.3.2'), - ('Boost', '1.82.0'), + ('Boost', '1.82.0'), ('GSL', '2.7'), ] From b017cb02a71b83766f831b9bf5b4d483eb8dbe9f Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 17:18:33 +0200 Subject: [PATCH 41/60] apptainer smoketest --- Clustering_apptainer_vanilla.yml | 4 +- Clustering_apptainer_vanilla_smoketest.yml | 129 +++++++++++++++++++++ 2 files changed, 131 insertions(+), 2 deletions(-) create mode 100644 Clustering_apptainer_vanilla_smoketest.yml diff --git a/Clustering_apptainer_vanilla.yml b/Clustering_apptainer_vanilla.yml index 46b8ea4..6bc5edd 100644 --- a/Clustering_apptainer_vanilla.yml +++ b/Clustering_apptainer_vanilla.yml @@ -10,9 +10,9 @@ software_backend: apptainer software_environments: clustbench: - description: "clustbench on py3.12.6" - conda: envs/clustbench.yml # not used + description: "clustbench on py3.12.3, default python" envmodule: na + conda: envs/clustbench.yml # not used apptainer: envs/clustbench-vanilla.sif fcps: diff --git a/Clustering_apptainer_vanilla_smoketest.yml 
b/Clustering_apptainer_vanilla_smoketest.yml new file mode 100644 index 0000000..99aff2e --- /dev/null +++ b/Clustering_apptainer_vanilla_smoketest.yml @@ -0,0 +1,129 @@ +id: clustering_example_envmodules +description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. +version: 1.4 +benchmarker: "Izaskun Mallona, Daniel Incicau" +benchmark_yaml_spec: 0.5 + +software_backend: apptainer + +software_environments: + + clustbench: + description: "clustbench on py3.12.3, default python" + envmodule: na + conda: envs/clustbench.yml # not used + apptainer: envs/clustbench-vanilla.sif + + fcps: + description: "CRAN's FCPS" + envmodule: na + conda: envs/fcps.yml # not used + apptainer: envs/fcps.sif + + rmarkdown: + description: "R with some plotting dependencies" + envmodule: na + conda: envs/rmarkdown.yml # not used + apptainer: envs/rmarkdown.sif + +metric_collectors: + - id: plotting + name: "Single-backend metric collector." + software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" + +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 366c5a2 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: 
https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url: https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" From 98777a52be5fc9500e715a42dd1f4e146bc467b6 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 17:21:29 +0200 Subject: [PATCH 42/60] add git in the image --- envs/clustbench_apptainer_vanillapy.def | 1 + 1 file 
changed, 1 insertion(+) diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def index 5d388bf..63f764a 100644 --- a/envs/clustbench_apptainer_vanillapy.def +++ b/envs/clustbench_apptainer_vanillapy.def @@ -13,6 +13,7 @@ From: ubuntu:noble-20250404 python3-venv \ python3-pip \ ca-certificates \ + git \ && apt-get clean && \ rm -rf /var/lib/apt/lists/* From f4ae29d1600097a42fc906557a085dea97ed8cf0 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 17:33:56 +0200 Subject: [PATCH 43/60] try to debug fastcluster problem --- envs/clustbench_apptainer_optimized.def | 4 ++-- envs/clustbench_apptainer_vanillapy.def | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def index 19726c2..8fc7e08 100644 --- a/envs/clustbench_apptainer_optimized.def +++ b/envs/clustbench_apptainer_optimized.def @@ -58,7 +58,7 @@ From: ubuntu:noble-20250404 # Install required packages with pip - pip install -U pip + pip install -U pip wheel pip install \ "clustering-benchmarks==1.1.6" \ @@ -83,7 +83,7 @@ From: ubuntu:noble-20250404 "six==1.17.0" \ "threadpoolctl==3.6.0" \ "tzdata==2025.2" \ - "fastcluster==1.2.6" \ + "fastcluster==1.3.0" \ "gitpython==3.1.43" \ "isodate==0.7.2" \ "pydantic-core==2.34.1" diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def index 63f764a..ff9dd91 100644 --- a/envs/clustbench_apptainer_vanillapy.def +++ b/envs/clustbench_apptainer_vanillapy.def @@ -24,7 +24,7 @@ From: ubuntu:noble-20250404 # Install required packages with pip - pip install -U pip + pip install -U pip wheel pip install \ "clustering-benchmarks==1.1.6" \ @@ -49,7 +49,7 @@ From: ubuntu:noble-20250404 "six==1.17.0" \ "threadpoolctl==3.6.0" \ "tzdata==2025.2" \ - "fastcluster==1.2.6" \ + "fastcluster==1.3.0" \ "gitpython==3.1.43" \ "isodate==0.7.2" \ "pydantic-core==2.34.1" From 72cdc598acfd10c2fd73bee49f7b66fdd6a62591 Mon 
Sep 17 00:00:00 2001 From: ben Date: Wed, 14 May 2025 13:26:23 +0200 Subject: [PATCH 44/60] fail if the exit code fails --- .github/workflows/benchmark.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 2a55846..e22b368 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -59,7 +59,7 @@ jobs: - name: Run benchmark shell: bash -l {0} - continue-on-error: true + continue-on-error: false run: | echo "y" | ob run benchmark -b Clustering.yaml --local --cores 3 --continue-on-error @@ -98,7 +98,7 @@ jobs: - name: Deploy to GitHub Pages uses: actions/deploy-pages@v4 - + - name: Create Job Summary if: always() run: | @@ -106,4 +106,3 @@ jobs: echo "- [Plotting Report](https://${{ github.repository_owner }}.github.io/${{ github.event.repository.name }})" >> $GITHUB_STEP_SUMMARY echo "### All Outputs" >> $GITHUB_STEP_SUMMARY echo "- [Complete Benchmark Output](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}#artifacts)" >> $GITHUB_STEP_SUMMARY - From 01243de1b555e2e5d4e7b31228d66d8a335edcb3 Mon Sep 17 00:00:00 2001 From: ben Date: Wed, 14 May 2025 13:29:16 +0200 Subject: [PATCH 45/60] use conda short for test --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index e22b368..b6cb977 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -61,7 +61,7 @@ jobs: shell: bash -l {0} continue-on-error: false run: | - echo "y" | ob run benchmark -b Clustering.yaml --local --cores 3 --continue-on-error + echo "y" | ob run benchmark -b Clustering_conda_smoketest.yml --local --cores 3 --continue-on-error upload-artifact: name: Benchmark Artifact From 7b213c9151be563d0fedcbfc9a46e6b10e4e8b2c Mon Sep 17 00:00:00 2001 From: ben Date: Fri, 16 May 2025 12:28:39 +0200 Subject: [PATCH 46/60] 
remove rmarkdown dedicated singularity image --- Clustering.yaml | 3 +- Clustering_apptainer_optimized.yml | 5 +-- Clustering_apptainer_vanilla.yml | 5 +-- Clustering_apptainer_vanilla_smoketest.yml | 3 +- Clustering_conda.yml | 3 +- Clustering_conda_smoketest.yml | 3 +- Clustering_envmodules.yml | 3 +- Clustering_envmodules_smoketest.yml | 3 +- Clustering_oras.yml | 3 +- envs/build_singularity.sh | 3 +- envs/fcps.def | 2 +- envs/rmarkdown.def | 40 ---------------------- 12 files changed, 22 insertions(+), 54 deletions(-) delete mode 100644 envs/rmarkdown.def diff --git a/Clustering.yaml b/Clustering.yaml index 689be2c..778675e 100644 --- a/Clustering.yaml +++ b/Clustering.yaml @@ -56,7 +56,8 @@ stages: software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_data - commit: 366c5a2 + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 # - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 diff --git a/Clustering_apptainer_optimized.yml b/Clustering_apptainer_optimized.yml index a073683..0a479e5 100644 --- a/Clustering_apptainer_optimized.yml +++ b/Clustering_apptainer_optimized.yml @@ -25,7 +25,7 @@ software_environments: description: "R with some plotting dependencies" conda: envs/rmarkdown.yml # not used envmodule: na - apptainer: envs/rmarkdown.sif + apptainer: envs/fcps.sif # we reuse fcps env metric_collectors: @@ -50,7 +50,8 @@ stages: software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data - commit: 366c5a2 + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 diff --git a/Clustering_apptainer_vanilla.yml 
b/Clustering_apptainer_vanilla.yml index 6bc5edd..49e188c 100644 --- a/Clustering_apptainer_vanilla.yml +++ b/Clustering_apptainer_vanilla.yml @@ -25,7 +25,7 @@ software_environments: description: "R with some plotting dependencies" conda: envs/rmarkdown.yml # not used envmodule: na - apptainer: envs/rmarkdown.sif + apptainer: envs/fcps.sif # we reuse fcps env metric_collectors: @@ -50,7 +50,8 @@ stages: software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data - commit: 366c5a2 + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 diff --git a/Clustering_apptainer_vanilla_smoketest.yml b/Clustering_apptainer_vanilla_smoketest.yml index 99aff2e..0a2139f 100644 --- a/Clustering_apptainer_vanilla_smoketest.yml +++ b/Clustering_apptainer_vanilla_smoketest.yml @@ -47,7 +47,8 @@ stages: software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data - commit: 366c5a2 + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 outputs: diff --git a/Clustering_conda.yml b/Clustering_conda.yml index 9e74ee5..17e48f0 100644 --- a/Clustering_conda.yml +++ b/Clustering_conda.yml @@ -50,7 +50,8 @@ stages: software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data - commit: 366c5a2 + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 diff --git a/Clustering_conda_smoketest.yml b/Clustering_conda_smoketest.yml index 15215d7..9f66440 
100644 --- a/Clustering_conda_smoketest.yml +++ b/Clustering_conda_smoketest.yml @@ -47,7 +47,8 @@ stages: software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data - commit: 366c5a2 + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 outputs: diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml index a2112d4..43c24fa 100644 --- a/Clustering_envmodules.yml +++ b/Clustering_envmodules.yml @@ -50,7 +50,8 @@ stages: software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data - commit: 366c5a2 + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - values: [ diff --git a/Clustering_envmodules_smoketest.yml b/Clustering_envmodules_smoketest.yml index 3fa8e81..27570bc 100644 --- a/Clustering_envmodules_smoketest.yml +++ b/Clustering_envmodules_smoketest.yml @@ -47,7 +47,8 @@ stages: software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data - commit: 366c5a2 + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 outputs: diff --git a/Clustering_oras.yml b/Clustering_oras.yml index c6f0d7e..ff2736b 100644 --- a/Clustering_oras.yml +++ b/Clustering_oras.yml @@ -40,7 +40,8 @@ stages: software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_data - commit: 366c5a2 + commit: 31ac323 + parameters: - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index 83203c8..a8cd330 
100755 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -1,8 +1,7 @@ #!/bin/sh CMD=singularity BUILD='build --fakeroot' -$CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def # enable this if you want to compare with the custom python compilation # $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def +$CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def $CMD ${BUILD} fcps.sif fcps.def -$CMD ${BUILD} rmarkdown.sif rmarkdown.def # this one is very similar to fcps, remove diff --git a/envs/fcps.def b/envs/fcps.def index 922d7f8..a4996d6 100644 --- a/envs/fcps.def +++ b/envs/fcps.def @@ -30,7 +30,7 @@ From: rocker/tidyverse:4.4 # Install R packages ## FIXME no versioning here - Rscript -e 'BiocManager::install(c( "dbscan", "cluster", "protoclust", "energy", "argparse", "mclust", "DataVisualizations", "FCPS", "cclust"))' + Rscript -e 'BiocManager::install(c( "dbscan", "cluster", "protoclust", "energy", "argparse", "mclust", "caret", "DataVisualizations", "FCPS", "cclust"))' echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT diff --git a/envs/rmarkdown.def b/envs/rmarkdown.def deleted file mode 100644 index aa20cc1..0000000 --- a/envs/rmarkdown.def +++ /dev/null @@ -1,40 +0,0 @@ -Bootstrap: docker -From: rocker/tidyverse:4.4 - -# TODO: we could merge this one with fcps.def, no need to duplicate the image. - -%labels - - AUTHOR izaskun.mallona@gmail.com - AUTHOR ben.uzh@proton.me - -%post - - # Install python (3.12 as of noble) - export DEBIAN_FRONTEND=noninteractive - apt-get update - apt-get install -y git \ - python-is-python3 \ - python3.12 \ - python3.12-venv \ - && apt-get clean && \ - rm -rf /var/lib/apt/lists/* - - # virtualenv - cd /opt - python3.12 -m venv "default" - . 
default/bin/activate - - pip install \ - "gitpython==3.1.43" \ - "isodate==0.7.2" \ - "pydantic-core==2.34.1" - - # Install R packages - Rscript -e 'BiocManager::install(c("mclust", "caret", "argparse"))' - - echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT - -%environment - - . /opt/default/bin/activate From ce34ee2ebdef1dd5366a07c5226853d92ed1a084 Mon Sep 17 00:00:00 2001 From: ben Date: Fri, 16 May 2025 13:22:44 +0200 Subject: [PATCH 47/60] remove wget dependency from apptainer, add upload script --- envs/build_singularity.sh | 4 ++++ envs/clustbench_apptainer_optimized.def | 3 --- envs/clustbench_apptainer_vanillapy.def | 3 --- envs/upload_to_registry.sh | 12 ++++++++++++ 4 files changed, 16 insertions(+), 6 deletions(-) create mode 100644 envs/upload_to_registry.sh diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index a8cd330..430ed30 100755 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -1,4 +1,8 @@ #!/bin/sh +# Builds singularity images. +# Installation guide: check https://apptainer.org/docs/user/latest/quick_start.html#installation +# Additionally, you will need: +# apt install fakeroot uidmap CMD=singularity BUILD='build --fakeroot' # enable this if you want to compare with the custom python compilation diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def index 8fc7e08..846bae3 100644 --- a/envs/clustbench_apptainer_optimized.def +++ b/envs/clustbench_apptainer_optimized.def @@ -88,9 +88,6 @@ From: ubuntu:noble-20250404 "isodate==0.7.2" \ "pydantic-core==2.34.1" - # TODO: can we use something more maintained? 
- pip install --pre "python3-wget==0.0.2-beta1" - # Do some cleanup to keep the image slim rm -rf ~/.cache rm -rf ~/src diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def index ff9dd91..a40366c 100644 --- a/envs/clustbench_apptainer_vanillapy.def +++ b/envs/clustbench_apptainer_vanillapy.def @@ -54,9 +54,6 @@ From: ubuntu:noble-20250404 "isodate==0.7.2" \ "pydantic-core==2.34.1" - # TODO: can we use something more maintained? - pip install --pre "python3-wget==0.0.2-beta1" - # Do some cleanup to keep the image slim rm -rf ~/.cache diff --git a/envs/upload_to_registry.sh b/envs/upload_to_registry.sh new file mode 100644 index 0000000..7e45e5a --- /dev/null +++ b/envs/upload_to_registry.sh @@ -0,0 +1,12 @@ +#!/bin/sh +USER=user +REGISTRY=quay.io +ORGANIZATION=omnibenchmark +CLUSTBENCH_REPO=clustbench-vanilla +CLUSTBENCH_TAG=0.1.0 +FCPS_REPO=fcps +FCPS_TAG=0.1.0 + +singularity registry login --username {$USER} docker://${REGISTRY} +singularity push ${CLUSTBENCH_REPO}.sif oras://${REGISTRY}/${ORGANIZATION}/${CLUSTBENCH_REPO}:${CLUSTBENCH_TAG} +singularity push ${FCPS_REPO}.sif oras://${REGISTRY}/${ORGANIZATION}/${FCPS_REPO}:${FCPS_TAG} From 0fe20a635adea19663dc81772fd75fe87dc8acdf Mon Sep 17 00:00:00 2001 From: ben Date: Fri, 16 May 2025 13:23:05 +0200 Subject: [PATCH 48/60] remove python3-wget --- envs/README.md | 9 ++------- envs/clustbench.eb | 5 ----- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/envs/README.md b/envs/README.md index 3cab925..4c68a1c 100644 --- a/envs/README.md +++ b/envs/README.md @@ -28,18 +28,14 @@ The "optimized" one does a custom python 3.12 compilation; the vanillapy stocks - `clustbench_apptainer_optimized.def` - `clustbench_apptainer_vanillapy.def` - `fcps.def` -- `rmarkdown.def` ### How to build - `make prepare_apptainer_env` from the root folder. -## Aptainer semi-reproducible and remote - -TODO: push to the registry (how?) 
- -No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://gitlab.renkulab.io/izaskun.mallona/clustering_example/container_registry. +## Aptainer semi-reproducible with registry pull +No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://quay.io/omnibenchmark registry. ## envmodules - reproducible builds with easybuild @@ -53,5 +49,4 @@ No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml ### How to build - `make prepare_envmodules_env` from the root folder. -- `python3-wget` from pypi doesn't look very well maintaned diff --git a/envs/clustbench.eb b/envs/clustbench.eb index daae6dd..7064c67 100644 --- a/envs/clustbench.eb +++ b/envs/clustbench.eb @@ -28,11 +28,6 @@ exts_list = [ ('hurry.filesize', '0.9', { 'checksums': ['f5368329adbef86accd3bc9490522340bb79260455ae89b1a42c10f63801b9a6'], }), - ('python3-wget', '0.0.2-beta1', { - 'modulename': 'wget', - 'source_urls': ['https://files.pythonhosted.org/packages/84/a4/d9da2989a3d937e94616ef07f0630c507f6baa77ad37f94ceb06b36cacc1/'], - 'checksums': ['bbe7f44b3c28c4f7126aff20e8a438e78f6e4f1878d8b0c4940e87363813c17d'], - }), ('genieclust', '1.1.6', { 'download_dep_fail': False, 'install_src': 'https://files.pythonhosted.org/packages/2a/09/d1fd7b02cfabe76262d0f88d74fa71dc93e857525f8249539ec5ab174292/genieclust-1.1.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl', From d6731437de350627a028310a6ce86e4a8de4916a Mon Sep 17 00:00:00 2001 From: ben Date: Fri, 16 May 2025 13:32:12 +0200 Subject: [PATCH 49/60] add yaml for registry runs --- ...s.yml => Clustering_apptainer_registry.yml | 59 ++++---- Clustering_apptainer_registry_smoketest.yml | 133 ++++++++++++++++++ Clustering_apptainer_vanilla.yml | 2 +- Makefile | 5 + envs/README.md | 6 +- 5 files changed, 177 insertions(+), 28 deletions(-) rename Clustering_oras.yml => 
Clustering_apptainer_registry.yml (88%) create mode 100644 Clustering_apptainer_registry_smoketest.yml diff --git a/Clustering_oras.yml b/Clustering_apptainer_registry.yml similarity index 88% rename from Clustering_oras.yml rename to Clustering_apptainer_registry.yml index ff2736b..7e090e3 100644 --- a/Clustering_oras.yml +++ b/Clustering_apptainer_registry.yml @@ -1,48 +1,59 @@ id: clustering_example_oras -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. Caution dirty apptainer sifs. + +description: Clustering benchmark on Gagolewski's. Using ORAS registry. version: 1.5 -benchmarker: "Izaskun Mallona, Daniel Incicau" +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" benchmark_yaml_spec: 0.4 -#storage: https://play.min.io -#storage_api: S3 -#storage_bucket_name: clustering_example - software_backend: apptainer software_environments: clustbench: - description: "clustbench on py3.12.6" + description: "clustbench on py3.12.3, default python" + envmodule: na conda: envs/clustbench.yml # not used - envmodule: clustbench - apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/clustbench:latest + apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 fcps: description: "CRAN's FCPS" conda: envs/fcps.yml # not used envmodule: na - apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/fcps:latest + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 rmarkdown: description: "R with some plotting dependencies" conda: envs/rmarkdown.yml # not used envmodule: na - apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/rmarkdown:latest + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + + +metric_collectors: + - id: plotting + name: "Single-backend metric collector." 
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" stages: - id: data modules: - id: clustbench - name: "clustbench datasets" - software_environment: "clustbench" + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data commit: 31ac323 - parameters: + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 @@ -104,23 +115,20 @@ stages: - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 outputs: - id: data.matrix path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" - id: data.true_labels path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - ## clustbench methods (fastcluster) ################################################################### - - id: clustering modules: - id: fastcluster name: "fastcluster algorithm" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_fastcluster - # url: /home/imallona/src/clustbench_fastcluster/ commit: "45e43d3" parameters: - values: ["--linkage", "complete"] @@ -129,12 +137,12 @@ stages: - values: ["--linkage", "weighted"] - values: ["--linkage", 
"median"] - values: ["--linkage", "centroid"] + - id: sklearn - name: "sklearn" - software_environment: "clustbench" + name: sklearn + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_sklearn - #url: /home/imallona/src/clustbench_sklearn commit: 5877378 parameters: - values: ["--method", "birch"] @@ -152,8 +160,8 @@ stages: - values: ["--linkage", "complete"] - values: ["--linkage", "ward"] - id: genieclust - name: "genieclust" - software_environment: "clustbench" + name: genieclust + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_genieclust commit: 6090043 @@ -192,7 +200,7 @@ stages: modules: - id: partition_metrics name: "clustbench partition metrics" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_metrics commit: 9132d45 @@ -215,4 +223,3 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - diff --git a/Clustering_apptainer_registry_smoketest.yml b/Clustering_apptainer_registry_smoketest.yml new file mode 100644 index 0000000..7aae229 --- /dev/null +++ b/Clustering_apptainer_registry_smoketest.yml @@ -0,0 +1,133 @@ +id: clustering_example_oras + +description: Clustering benchmark on Gagolewski's. Using ORAS registry. 
+version: 1.5 + +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 + +software_backend: apptainer + +software_environments: + + clustbench: + description: "clustbench on py3.12.3, default python" + envmodule: na + conda: envs/clustbench.yml # not used + apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 + + fcps: + description: "CRAN's FCPS" + conda: envs/fcps.yml # not used + envmodule: na + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml # not used + envmodule: na + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + + +metric_collectors: + - id: plotting + name: "Single-backend metric collector." + software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" + +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 31ac323 + + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + 
repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url: https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" diff --git a/Clustering_apptainer_vanilla.yml b/Clustering_apptainer_vanilla.yml index 49e188c..f11b80a 100644 --- a/Clustering_apptainer_vanilla.yml +++ b/Clustering_apptainer_vanilla.yml @@ -38,7 +38,7 @@ metric_collectors: inputs: - metrics.scores outputs: - - id: plotting.html + id: plotting.html path: "{input}/{name}/plotting_report.html" stages: diff --git a/Makefile b/Makefile index f342949..27029dd 100644 --- a/Makefile +++ b/Makefile @@ 
-12,6 +12,8 @@ prepare_envmodules_env: cd envs && eb rmarkdown.eb --robot # short versions, to debug runs & environments +run_with_apptainer_backend_registry_short: + ${OB_CMD} -b Clustering_apptainer_registry_smoketest.yml run_with_apptainer_backend_short: ${OB_CMD} -b Clustering_apptainer_vanilla_smoketest.yml mv out out_apptainer_short @@ -23,6 +25,9 @@ run_with_envmodules_backend_short: mv out out_lmod_short # full versions (expect hours) +run_with_apptainer_backend_registry: + ${OB_CMD} -b Clustering_apptainer_registry.yml + mv out out_apptainer_registry run_with_apptainer_backend_vanilla: ${OB_CMD} -b Clustering_apptainer_vanilla.yml mv out out_apptainer_vanilla diff --git a/envs/README.md b/envs/README.md index 4c68a1c..bb1f174 100644 --- a/envs/README.md +++ b/envs/README.md @@ -35,7 +35,11 @@ The "optimized" one does a custom python 3.12 compilation; the vanillapy stocks ## Aptainer semi-reproducible with registry pull -No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://quay.io/omnibenchmark registry. 
+No need to prepare/build anything, since it fetches the apptainer images from a remote registry: + +```bash +ob run benchmark -b Clustering_apptainer_registry.yml --local +``` ## envmodules - reproducible builds with easybuild From 765e4189449806859182976c09356f32fe856db1 Mon Sep 17 00:00:00 2001 From: ben Date: Fri, 16 May 2025 13:46:58 +0200 Subject: [PATCH 50/60] bump clustering-benchmarks to 1.1.6 --- envs/clustbench.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/envs/clustbench.yml b/envs/clustbench.yml index 6cb6201..f894c22 100644 --- a/envs/clustbench.yml +++ b/envs/clustbench.yml @@ -6,9 +6,7 @@ dependencies: - conda-forge::python=3.12.6 - conda-forge::pip - pip: - #- "clustering-benchmarks==1.1.5" - - 'https://github.com/gagolews/clustering-benchmarks/releases/download/v1.1.5/clustering_benchmarks-1.1.5.tar.gz' - - "wget" + - "clustering-benchmarks==1.1.6" - "fastcluster==1.2.6" - "numpy==1.26.4" - "scipy==1.14.1" From 365f2a0a9f00837b2dab14a0c1853719fed4aad4 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 19 May 2025 13:22:17 +0200 Subject: [PATCH 51/60] templatize --- Clustering.yaml | 233 ------------------ Clustering_apptainer.yml | 211 ++++++++++++++++ Clustering_apptainer_optimized.yml | 37 +-- ...> Clustering_apptainer_optimized_short.yml | 37 ++- Clustering_apptainer_short.yml | 121 +++++++++ Clustering_apptainer_vanilla.yml | 43 ++-- ... 
=> Clustering_apptainer_vanilla_short.yml | 33 +-- Clustering_conda.yml | 50 ++-- ...moketest.yml => Clustering_conda_short.yml | 34 +-- Clustering_envmodules.yml | 214 ++++------------ Clustering_envmodules_short.yml | 121 +++++++++ Makefile | 39 ++- README.md | 5 + Clustering_apptainer_registry.yml => base.yml | 21 +- envs/build_singularity.sh | 2 +- overrides/apptainer.yml | 4 + overrides/apptainer_optimized.yml | 12 + overrides/apptainer_vanilla.yml | 12 + overrides/conda.yml | 4 + overrides/envmodules.yml | 4 + .../base.yml | 19 +- 21 files changed, 668 insertions(+), 588 deletions(-) delete mode 100644 Clustering.yaml create mode 100644 Clustering_apptainer.yml rename Clustering_apptainer_vanilla_smoketest.yml => Clustering_apptainer_optimized_short.yml (85%) create mode 100644 Clustering_apptainer_short.yml rename Clustering_envmodules_smoketest.yml => Clustering_apptainer_vanilla_short.yml (88%) rename Clustering_apptainer_registry_smoketest.yml => Clustering_conda_short.yml (89%) create mode 100644 Clustering_envmodules_short.yml rename Clustering_apptainer_registry.yml => base.yml (96%) create mode 100644 overrides/apptainer.yml create mode 100644 overrides/apptainer_optimized.yml create mode 100644 overrides/apptainer_vanilla.yml create mode 100644 overrides/conda.yml create mode 100644 overrides/envmodules.yml rename Clustering_conda_smoketest.yml => smoketest/base.yml (92%) diff --git a/Clustering.yaml b/Clustering.yaml deleted file mode 100644 index 778675e..0000000 --- a/Clustering.yaml +++ /dev/null @@ -1,233 +0,0 @@ -id: clustering_example -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
-version: 1.2 -benchmarker: "Izaskun Mallona, Daniel Incicau" -benchmark_yaml_spec: 0.04 -# storage: https://play.min.io -# storage_api: S3 -# storage_bucket_name: clustering_example -software_backend: conda -software_environments: - clustbench: - description: "clustbench on py3.12.6" - conda: envs/clustbench.yml - envmodule: clustbench - apptainer: envs/clustbench.sif - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: envs/sklearn.sif - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: envs/r.sif - envmodule: fcps # not true, but - rmarkdown: - description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml - apptainer: envs/r.sif # not true, but - envmodule: fcps # not true, but - fcps: - description: "CRAN's FCPS" - conda: envs/fcps.yml - apptainer: envs/fcps.sif - envmodule: fcps -metric_collectors: - - id: plotting - name: "Single-backend metric collector." 
- software_environment: "rmarkdown" - repository: - url: https://github.com/imallona/clustering_report - commit: 1d6bdf5 - inputs: - - metrics.scores - outputs: - - id: plotting.html - path: "{input}/{name}/plotting_report.html" - -stages: - ## clustbench data ########################################################## - - - id: data - modules: - - id: clustbench - name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_data - commit: 31ac323 - - parameters: # comments depict the possible cardinalities and the number of curated labelsets - - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 - # - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", 
"ring_outliers"] # 2, 5 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 - # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", 
"yeast"] # 10 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 - # - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 - outputs: - - id: data.matrix - path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" - - id: data.true_labels - path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - - ## clustbench methods (fastcluster) 
################################################################### - - - id: clustering - modules: - - id: fastcluster - name: "fastcluster algorithm" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_fastcluster - # url: /home/imallona/src/clustbench_fastcluster/ - commit: "45e43d3" - parameters: - - values: ["--linkage", "complete"] - - values: ["--linkage", "ward"] - # - values: ["--linkage", "average"] - # - values: ["--linkage", "weighted"] - # - values: ["--linkage", "median"] - # - values: ["--linkage", "centroid"] - - id: sklearn - name: "sklearn" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_sklearn - #url: /home/imallona/src/clustbench_sklearn - commit: 5877378 - parameters: - - values: ["--method", "birch"] - - values: ["--method", "kmeans"] - # - values: ["--method", "spectral"] ## too slow - # - values: ["--method", "gm"] - - id: agglomerative - name: "agglomerative" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_agglomerative - commit: 5454368 - parameters: - # - values: ["--linkage", "average"] - - values: ["--linkage", "complete"] - - values: ["--linkage", "ward"] - - id: genieclust - name: "genieclust" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_genieclust - commit: 6090043 - parameters: - - values: ["--method", "genie", "--gini_threshold", 0.5] - - values: ["--method", "gic"] - # - values: ["--method", "ica"] - - id: fcps - name: "fcps" - software_environment: "fcps" - repository: - url: https://github.com/imallona/clustbench_fcps - commit: 272fa5f - parameters: - # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in conda - - values: ["--method", "FCPS_Minimax"] - - values: ["--method", "FCPS_MinEnergy"] - # - values: ["--method", "FCPS_HDBSCAN_2"] - # - values: ["--method", "FCPS_HDBSCAN_4"] - # - values: ["--method", "FCPS_HDBSCAN_8"] - # 
- values: ["--method", "FCPS_Diana"] - # - values: ["--method", "FCPS_Fanny"] - # - values: ["--method", "FCPS_Hardcl"] - # - values: ["--method", "FCPS_Softcl"] - # - values: ["--method", "FCPS_Clara"] - # - values: ["--method", "FCPS_PAM"] - inputs: - - entries: - - data.matrix - - data.true_labels - outputs: - - id: clustering.predicted_ks_range - path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" - - - id: metrics - modules: - - id: partition_metrics - name: "clustbench partition metrics" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_metrics - commit: 9132d45 - parameters: - - values: ["--metric", "normalized_clustering_accuracy"] - - values: ["--metric", "adjusted_fm_score"] - # - values: ["--metric", "adjusted_mi_score"] - # - values: ["--metric", "adjusted_rand_score"] - # - values: ["--metric", "fm_score"] - # - values: ["--metric", "mi_score"] - # - values: ["--metric", "normalized_clustering_accuracy"] - # - values: ["--metric", "normalized_mi_score"] - # - values: ["--metric", "normalized_pivoted_accuracy"] - # - values: ["--metric", "pair_sets_index"] - # - values: ["--metric", "rand_score"] - inputs: - - entries: - - clustering.predicted_ks_range - - data.true_labels - outputs: - - id: metrics.scores - path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" diff --git a/Clustering_apptainer.yml b/Clustering_apptainer.yml new file mode 100644 index 0000000..e075d81 --- /dev/null +++ b/Clustering_apptainer.yml @@ -0,0 +1,211 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 +software_environments: + clustbench: + description: "clustbench on py3.12.9, optimized python build" + conda: envs/clustbench.yml + envmodule: clustbench/0.1.0-foss-2023b + apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 + fcps: + description: "CRAN's FCPS" + conda: 
envs/fcps.yml + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 +metric_collectors: + - id: plotting + name: "Single-backend metric collector." + software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 + - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 + - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 + - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 + - values: ["--dataset_generator", "graves", 
"--dataset_name", "parabolic"] # 2, 4 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 + - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 + - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 + - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 + - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 + - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 + - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 + - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 
7 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 + - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 + outputs: + - id: data.matrix + path: 
"{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - values: ["--linkage", "average"] + - values: ["--linkage", "weighted"] + - values: ["--linkage", "median"] + - values: ["--linkage", "centroid"] + - id: sklearn + name: sklearn + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + # - values: ["--method", "spectral"] ## too slow + - values: ["--method", "gm"] + - id: agglomerative + name: "agglomerative" + software_environment: "clustbench" + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "average"] + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: genieclust + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - values: ["--method", "gic"] + - values: ["--method", "ica"] + - id: fcps + name: "fcps" + software_environment: "fcps" + repository: + url: https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in conda + - values: ["--method", "FCPS_Minimax"] + - values: ["--method", "FCPS_MinEnergy"] + - values: ["--method", "FCPS_HDBSCAN_2"] + - values: ["--method", "FCPS_HDBSCAN_4"] + - values: ["--method", "FCPS_HDBSCAN_8"] + - values: ["--method", "FCPS_Diana"] + - values: 
["--method", "FCPS_Fanny"] + - values: ["--method", "FCPS_Hardcl"] + - values: ["--method", "FCPS_Softcl"] + - values: ["--method", "FCPS_Clara"] + - values: ["--method", "FCPS_PAM"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 9132d45 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + - values: ["--metric", "adjusted_mi_score"] + - values: ["--metric", "adjusted_rand_score"] + - values: ["--metric", "fm_score"] + - values: ["--metric", "mi_score"] + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "normalized_mi_score"] + - values: ["--metric", "normalized_pivoted_accuracy"] + - values: ["--metric", "pair_sets_index"] + - values: ["--metric", "rand_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" +id: clustering_benchmark_apptainer_oras +description: Clustering benchmark on Gagolewski's. Using apptainer from omnibenchmark ORAS registry. +software_backend: apptainer diff --git a/Clustering_apptainer_optimized.yml b/Clustering_apptainer_optimized.yml index 0a479e5..d536ddc 100644 --- a/Clustering_apptainer_optimized.yml +++ b/Clustering_apptainer_optimized.yml @@ -1,33 +1,23 @@ -id: clustering_example_apptainer_optimized -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
- +# this file has been generated automatically - DO NOT EDIT BY HAND version: 1.5 benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" benchmark_yaml_spec: 0.4 - -software_backend: apptainer - software_environments: - clustbench: description: "clustbench on py3.12.9, optimized python build" - conda: envs/clustbench.yml # not used - envmodule: na + conda: envs/clustbench.yml + envmodule: clustbench/0.1.0-foss-2023b apptainer: envs/clustbench-optimized.sif - fcps: description: "CRAN's FCPS" - conda: envs/fcps.yml # not used - envmodule: na + conda: envs/fcps.yml + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 apptainer: envs/fcps.sif - rmarkdown: description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml # not used - envmodule: na - apptainer: envs/fcps.sif # we reuse fcps env - - + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: envs/fcps.sif metric_collectors: - id: plotting name: "Single-backend metric collector." @@ -40,9 +30,7 @@ metric_collectors: outputs: - id: plotting.html path: "{input}/{name}/plotting_report.html" - stages: - - id: data modules: - id: clustbench @@ -51,8 +39,7 @@ stages: repository: url: https://github.com/imallona/clustbench_data commit: 31ac323 - - parameters: # comments depict the possible cardinalities and the number of curated labelsets + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 @@ -120,7 +107,6 @@ stages: path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" - id: data.true_labels path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - - id: clustering modules: - id: fastcluster @@ -136,7 +122,6 @@ stages: - values: ["--linkage", "weighted"] - values: ["--linkage", "median"] - 
values: ["--linkage", "centroid"] - - id: sklearn name: sklearn software_environment: clustbench @@ -194,7 +179,6 @@ stages: outputs: - id: clustering.predicted_ks_range path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" - - id: metrics modules: - id: partition_metrics @@ -222,3 +206,6 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" +id: clustering_benchmark_apptainer_optimized_local +description: Clustering benchmark on Gagolewski's. Using apptainer (locally built image, optimized python) +software_backend: apptainer diff --git a/Clustering_apptainer_vanilla_smoketest.yml b/Clustering_apptainer_optimized_short.yml similarity index 85% rename from Clustering_apptainer_vanilla_smoketest.yml rename to Clustering_apptainer_optimized_short.yml index 0a2139f..5bbd791 100644 --- a/Clustering_apptainer_vanilla_smoketest.yml +++ b/Clustering_apptainer_optimized_short.yml @@ -1,31 +1,23 @@ -id: clustering_example_envmodules -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
-version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" +# this file has been generated automatically - DO NOT EDIT BY HAND +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" benchmark_yaml_spec: 0.5 - -software_backend: apptainer - software_environments: - clustbench: description: "clustbench on py3.12.3, default python" - envmodule: na - conda: envs/clustbench.yml # not used - apptainer: envs/clustbench-vanilla.sif - + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml + apptainer: envs/clustbench-optimized.sif fcps: description: "CRAN's FCPS" - envmodule: na - conda: envs/fcps.yml # not used + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml apptainer: envs/fcps.sif - rmarkdown: description: "R with some plotting dependencies" - envmodule: na - conda: envs/rmarkdown.yml # not used - apptainer: envs/rmarkdown.sif - + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: envs/fcps.sif metric_collectors: - id: plotting name: "Single-backend metric collector." 
@@ -38,7 +30,6 @@ metric_collectors: outputs: - id: plotting.html path: "{input}/{name}/plotting_report.html" - stages: - id: data modules: @@ -48,7 +39,6 @@ stages: repository: url: https://github.com/imallona/clustbench_data commit: 31ac323 - parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 outputs: @@ -56,7 +46,6 @@ stages: path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" - id: data.true_labels path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - - id: clustering modules: - id: fastcluster @@ -109,7 +98,6 @@ stages: outputs: - id: clustering.predicted_ks_range path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" - - id: metrics modules: - id: partition_metrics @@ -128,3 +116,6 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" +id: clustering_benchmark_apptainer_optimized_local +description: Clustering benchmark on Gagolewski's. 
Using apptainer (locally built image, optimized python) +software_backend: apptainer diff --git a/Clustering_apptainer_short.yml b/Clustering_apptainer_short.yml new file mode 100644 index 0000000..71bdd6f --- /dev/null +++ b/Clustering_apptainer_short.yml @@ -0,0 +1,121 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.5 +software_environments: + clustbench: + description: "clustbench on py3.12.3, default python" + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml + apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 + fcps: + description: "CRAN's FCPS" + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 +metric_collectors: + - id: plotting + name: "Single-backend metric collector." 
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url: 
https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" +id: clustering_benchmark_apptainer_oras +description: Clustering benchmark on Gagolewski's. Using apptainer from omnibenchmark ORAS registry. +software_backend: apptainer diff --git a/Clustering_apptainer_vanilla.yml b/Clustering_apptainer_vanilla.yml index f11b80a..cd4ba56 100644 --- a/Clustering_apptainer_vanilla.yml +++ b/Clustering_apptainer_vanilla.yml @@ -1,33 +1,23 @@ -id: clustering_example_apptainer_vanilla - -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
+# this file has been generated automatically - DO NOT EDIT BY HAND version: 1.5 benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" benchmark_yaml_spec: 0.4 - -software_backend: apptainer - software_environments: - clustbench: - description: "clustbench on py3.12.3, default python" - envmodule: na - conda: envs/clustbench.yml # not used - apptainer: envs/clustbench-vanilla.sif - + description: "clustbench on py3.12.9, optimized python build" + conda: envs/clustbench.yml + envmodule: clustbench/0.1.0-foss-2023b + apptainer: envs/clustbench.sif fcps: description: "CRAN's FCPS" - conda: envs/fcps.yml # not used - envmodule: na + conda: envs/fcps.yml + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 apptainer: envs/fcps.sif - rmarkdown: description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml # not used - envmodule: na - apptainer: envs/fcps.sif # we reuse fcps env - - + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: envs/fcps.sif metric_collectors: - id: plotting name: "Single-backend metric collector." 
@@ -38,11 +28,9 @@ metric_collectors: inputs: - metrics.scores outputs: - id: plotting.html + - id: plotting.html path: "{input}/{name}/plotting_report.html" - stages: - - id: data modules: - id: clustbench @@ -51,8 +39,7 @@ stages: repository: url: https://github.com/imallona/clustbench_data commit: 31ac323 - - parameters: # comments depict the possible cardinalities and the number of curated labelsets + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 @@ -120,7 +107,6 @@ stages: path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" - id: data.true_labels path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - - id: clustering modules: - id: fastcluster @@ -136,7 +122,6 @@ stages: - values: ["--linkage", "weighted"] - values: ["--linkage", "median"] - values: ["--linkage", "centroid"] - - id: sklearn name: sklearn software_environment: clustbench @@ -194,7 +179,6 @@ stages: outputs: - id: clustering.predicted_ks_range path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" - - id: metrics modules: - id: partition_metrics @@ -222,3 +206,6 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" +id: clustering_benchmark_apptainer_vanilla_local +description: Clustering benchmark on Gagolewski's. 
Using apptainer (locally built image) +software_backend: apptainer diff --git a/Clustering_envmodules_smoketest.yml b/Clustering_apptainer_vanilla_short.yml similarity index 88% rename from Clustering_envmodules_smoketest.yml rename to Clustering_apptainer_vanilla_short.yml index 27570bc..01a1fe2 100644 --- a/Clustering_envmodules_smoketest.yml +++ b/Clustering_apptainer_vanilla_short.yml @@ -1,31 +1,23 @@ -id: clustering_example_envmodules -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. -version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" +# this file has been generated automatically - DO NOT EDIT BY HAND +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" benchmark_yaml_spec: 0.5 - -software_backend: envmodules - software_environments: - clustbench: - description: "clustbench on py3.12.6" + description: "clustbench on py3.12.3, default python" envmodule: clustbench/0.1.0-foss-2023b conda: envs/clustbench.yml - apptainer: na - + apptainer: envs/clustbench.sif fcps: description: "CRAN's FCPS" envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 conda: envs/fcps.yml - apptainer: na - + apptainer: envs/fcps.sif rmarkdown: description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 - conda: envs/clustbench.yml - apptainer: na - + apptainer: envs/fcps.sif metric_collectors: - id: plotting name: "Single-backend metric collector." 
@@ -38,7 +30,6 @@ metric_collectors: outputs: - id: plotting.html path: "{input}/{name}/plotting_report.html" - stages: - id: data modules: @@ -48,7 +39,6 @@ stages: repository: url: https://github.com/imallona/clustbench_data commit: 31ac323 - parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 outputs: @@ -56,9 +46,6 @@ stages: path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" - id: data.true_labels path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - - ## clustbench methods (fastcluster) ################################################################### - - id: clustering modules: - id: fastcluster @@ -111,7 +98,6 @@ stages: outputs: - id: clustering.predicted_ks_range path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" - - id: metrics modules: - id: partition_metrics @@ -130,3 +116,6 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" +id: clustering_benchmark_apptainer_vanilla_local +description: Clustering benchmark on Gagolewski's. Using apptainer (locally built image) +software_backend: apptainer diff --git a/Clustering_conda.yml b/Clustering_conda.yml index 17e48f0..5fd45d2 100644 --- a/Clustering_conda.yml +++ b/Clustering_conda.yml @@ -1,32 +1,23 @@ -id: clustering_example_conda - -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
+# this file has been generated automatically - DO NOT EDIT BY HAND version: 1.5 benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" benchmark_yaml_spec: 0.4 - -software_backend: conda - software_environments: - clustbench: - description: "clustbench on py3.12.6" + description: "clustbench on py3.12.9, optimized python build" conda: envs/clustbench.yml - envmodule: clustbench - apptainer: na - + envmodule: clustbench/0.1.0-foss-2023b + apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 fcps: description: "CRAN's FCPS" conda: envs/fcps.yml - envmodule: fcps - apptainer: na - + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 rmarkdown: description: "R with some plotting dependencies" conda: envs/rmarkdown.yml - envmodule: fcps # not used - apptainer: na - + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 metric_collectors: - id: plotting name: "Single-backend metric collector." 
@@ -39,10 +30,7 @@ metric_collectors: outputs: - id: plotting.html path: "{input}/{name}/plotting_report.html" - stages: - ## clustbench data ########################################################## - - id: data modules: - id: clustbench @@ -51,8 +39,7 @@ stages: repository: url: https://github.com/imallona/clustbench_data commit: 31ac323 - - parameters: # comments depict the possible cardinalities and the number of curated labelsets + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 @@ -120,17 +107,13 @@ stages: path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" - id: data.true_labels path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - - ## clustbench methods (fastcluster) ################################################################### - - id: clustering modules: - id: fastcluster name: "fastcluster algorithm" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_fastcluster - # url: /home/imallona/src/clustbench_fastcluster/ commit: "45e43d3" parameters: - values: ["--linkage", "complete"] @@ -140,11 +123,10 @@ stages: - values: ["--linkage", "median"] - values: ["--linkage", "centroid"] - id: sklearn - name: "sklearn" + name: sklearn software_environment: clustbench repository: url: https://github.com/imallona/clustbench_sklearn - #url: /home/imallona/src/clustbench_sklearn commit: 5877378 parameters: - values: ["--method", "birch"] @@ -162,8 +144,8 @@ stages: - values: ["--linkage", "complete"] - values: ["--linkage", "ward"] - id: genieclust - name: "genieclust" - software_environment: "clustbench" + name: genieclust + software_environment: clustbench repository: url: 
https://github.com/imallona/clustbench_genieclust commit: 6090043 @@ -197,12 +179,11 @@ stages: outputs: - id: clustering.predicted_ks_range path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" - - id: metrics modules: - id: partition_metrics name: "clustbench partition metrics" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_metrics commit: 9132d45 @@ -225,3 +206,6 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" +id: clustering_benchmark_conda +description: Clustering benchmark on Gagolewski's. Using conda. +software_backend: conda diff --git a/Clustering_apptainer_registry_smoketest.yml b/Clustering_conda_short.yml similarity index 89% rename from Clustering_apptainer_registry_smoketest.yml rename to Clustering_conda_short.yml index 7aae229..fd9ae01 100644 --- a/Clustering_apptainer_registry_smoketest.yml +++ b/Clustering_conda_short.yml @@ -1,34 +1,23 @@ -id: clustering_example_oras - -description: Clustering benchmark on Gagolewski's. Using ORAS registry. 
+# this file has been generated automatically - DO NOT EDIT BY HAND version: 1.5 - benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" -benchmark_yaml_spec: 0.4 - -software_backend: apptainer - +benchmark_yaml_spec: 0.5 software_environments: - clustbench: description: "clustbench on py3.12.3, default python" - envmodule: na - conda: envs/clustbench.yml # not used + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 - fcps: description: "CRAN's FCPS" - conda: envs/fcps.yml # not used - envmodule: na + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 - rmarkdown: description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml # not used - envmodule: na + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 - - metric_collectors: - id: plotting name: "Single-backend metric collector." 
@@ -41,7 +30,6 @@ metric_collectors: outputs: - id: plotting.html path: "{input}/{name}/plotting_report.html" - stages: - id: data modules: @@ -51,7 +39,6 @@ stages: repository: url: https://github.com/imallona/clustbench_data commit: 31ac323 - parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 outputs: @@ -59,7 +46,6 @@ stages: path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" - id: data.true_labels path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - - id: clustering modules: - id: fastcluster @@ -112,7 +98,6 @@ stages: outputs: - id: clustering.predicted_ks_range path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" - - id: metrics modules: - id: partition_metrics @@ -131,3 +116,6 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" +id: clustering_benchmark_conda +description: Clustering benchmark on Gagolewski's. Using conda. +software_backend: conda diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml index 43c24fa..805e130 100644 --- a/Clustering_envmodules.yml +++ b/Clustering_envmodules.yml @@ -1,33 +1,23 @@ -id: clustering_example_envmodules - -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
+# this file has been generated automatically - DO NOT EDIT BY HAND version: 1.5 benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" benchmark_yaml_spec: 0.4 - -software_backend: envmodules - software_environments: - clustbench: - description: "clustbench on py3.12.6" - conda: envs/clustbench.yml # not used + description: "clustbench on py3.12.9, optimized python build" + conda: envs/clustbench.yml envmodule: clustbench/0.1.0-foss-2023b - apptainer: na - - rmarkdown: - description: "R with some plotting dependencies" - conda: envs/rmakrkdown.yml # not used - envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 - apptainer: na - + apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 fcps: description: "CRAN's FCPS" - conda: envs/fcps.yml # not used + conda: envs/fcps.yml envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 - apptainer: na - - + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 metric_collectors: - id: plotting name: "Single-backend metric collector." 
@@ -40,9 +30,7 @@ metric_collectors: outputs: - id: plotting.html path: "{input}/{name}/plotting_report.html" - stages: - - id: data modules: - id: clustbench @@ -51,133 +39,42 @@ stages: repository: url: https://github.com/imallona/clustbench_data commit: 31ac323 - parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - - values: [ - "--dataset_generator", - "fcps", - "--dataset_name", - "chainlink", - ] # 2 1 - - values: [ - "--dataset_generator", - "fcps", - "--dataset_name", - "engytime", - ] # 2 2 + - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 - - values: [ - "--dataset_generator", - "fcps", - "--dataset_name", - "twodiamonds", - ] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 - - values: [ - "--dataset_generator", - "graves", - "--dataset_name", - "fuzzyx", - ] # 2, 4, 5 6 + - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 - - values: [ - "--dataset_generator", - "graves", - "--dataset_name", - "parabolic", - ] # 2, 4 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 - - values: [ - "--dataset_generator", - "graves", - 
"--dataset_name", - "ring_noisy", - ] # 2 1 - - values: [ - "--dataset_generator", - "graves", - "--dataset_name", - "ring_outliers", - ] # 2, 5 2 - - values: [ - "--dataset_generator", - "graves", - "--dataset_name", - "zigzag", - ] # 3, 5 2 - - values: [ - "--dataset_generator", - "graves", - "--dataset_name", - "zigzag_noisy", - ] # 3, 5 2 - - values: [ - "--dataset_generator", - "graves", - "--dataset_name", - "zigzag_outliers", - ] # 3, 5 2 - - values: [ - "--dataset_generator", - "other", - "--dataset_name", - "chameleon_t4_8k", - ] # 6 1 - - values: [ - "--dataset_generator", - "other", - "--dataset_name", - "chameleon_t5_8k", - ] # 6 1 - - values: [ - "--dataset_generator", - "other", - "--dataset_name", - "hdbscan", - ] # 6 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 + - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 - - values: [ - "--dataset_generator", - "sipu", - "--dataset_name", - "aggregation", - ] # 7 1 - - values: [ - "--dataset_generator", - "sipu", - "--dataset_name", - "compound", - ] # 4, 5, 6 5 + - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 + - values: ["--dataset_generator", "sipu", 
"--dataset_name", "compound"] # 4, 5, 6 5 - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 - - values: [ - "--dataset_generator", - "sipu", - "--dataset_name", - "pathbased", - ] # 3, 4 2 + - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 - - values: [ - "--dataset_generator", - "sipu", - "--dataset_name", - "unbalance", - ] # 8 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 - - values: [ - "--dataset_generator", - "uci", - "--dataset_name", - "ionosphere", - ] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 @@ -186,18 +83,8 @@ stages: - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 - - values: [ - "--dataset_generator", - "wut", - "--dataset_name", - "isolation", - ] # 3 1 - - values: [ - "--dataset_generator", - "wut", - "--dataset_name", - "labirynth", - ] # 6 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 @@ 
-205,24 +92,9 @@ stages: - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 - - values: [ - "--dataset_generator", - "wut", - "--dataset_name", - "trajectories", - ] # 4 1 - - values: [ - "--dataset_generator", - "wut", - "--dataset_name", - "trapped_lovers", - ] # 3 1 - - values: [ - "--dataset_generator", - "wut", - "--dataset_name", - "twosplashes", - ] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 @@ -235,9 +107,6 @@ stages: path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" - id: data.true_labels path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - - ## clustbench methods (fastcluster) ################################################################### - - id: clustering modules: - id: fastcluster @@ -245,7 +114,6 @@ stages: software_environment: clustbench repository: url: https://github.com/imallona/clustbench_fastcluster - # url: /home/imallona/src/clustbench_fastcluster/ commit: "45e43d3" parameters: - values: ["--linkage", "complete"] @@ -255,7 +123,7 @@ stages: - values: ["--linkage", "median"] - values: ["--linkage", "centroid"] - id: sklearn - name: "sklearn" + name: sklearn software_environment: clustbench repository: url: https://github.com/imallona/clustbench_sklearn @@ -267,7 +135,7 @@ stages: - values: ["--method", "gm"] - id: agglomerative name: "agglomerative" - software_environment: clustbench + software_environment: "clustbench" 
repository: url: https://github.com/imallona/clustbench_agglomerative commit: 5454368 @@ -276,7 +144,7 @@ stages: - values: ["--linkage", "complete"] - values: ["--linkage", "ward"] - id: genieclust - name: "genieclust" + name: genieclust software_environment: clustbench repository: url: https://github.com/imallona/clustbench_genieclust @@ -287,7 +155,7 @@ stages: - values: ["--method", "ica"] - id: fcps name: "fcps" - software_environment: fcps + software_environment: "fcps" repository: url: https://github.com/imallona/clustbench_fcps commit: 272fa5f @@ -311,7 +179,6 @@ stages: outputs: - id: clustering.predicted_ks_range path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" - - id: metrics modules: - id: partition_metrics @@ -319,7 +186,7 @@ stages: software_environment: clustbench repository: url: https://github.com/imallona/clustbench_metrics - commit: 8184cd4 + commit: 9132d45 parameters: - values: ["--metric", "normalized_clustering_accuracy"] - values: ["--metric", "adjusted_fm_score"] @@ -339,3 +206,6 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" +id: clustering_benchmark_envmodules +description: Clustering benchmark on Gagolewski's. Using envmodules. 
+software_backend: envmodules diff --git a/Clustering_envmodules_short.yml b/Clustering_envmodules_short.yml new file mode 100644 index 0000000..e3dc0fd --- /dev/null +++ b/Clustering_envmodules_short.yml @@ -0,0 +1,121 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.5 +software_environments: + clustbench: + description: "clustbench on py3.12.3, default python" + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml + apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 + fcps: + description: "CRAN's FCPS" + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 +metric_collectors: + - id: plotting + name: "Single-backend metric collector." 
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url: 
https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" +id: clustering_benchmark_envmodules +description: Clustering benchmark on Gagolewski's. Using envmodules. +software_backend: envmodules diff --git a/Makefile b/Makefile index 27029dd..16b144a 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,40 @@ MAX_CORES ?= 10 TIMEOUT ?= 4h +YQ_MERGE=yq eval-all 'select(fileIndex==1) * select(fileIndex==0)' # by default, we want to run all snakemake rules even if there are failures (-k) OB_CMD=ob run benchmark -k --local --task-timeout ${TIMEOUT} --cores ${MAX_CORES} +APPTR = apptainer +APPTV = apptainer_vanilla +APPTO = apptainer_optimized +CONDA = conda +ENVMD = envmodules + +BASE = base.yml +BASE_SHORT = smoketest/base.yml + +# Install dependencies to generate files (requires go in the system) +deps: + go install github.com/mikefarah/yq/v4@latest + +# Generate all the yaml files from base + overrides +generate: + ${YQ_MERGE} overrides/${APPTR}.yml ${BASE} > Clustering_${APPTR}.yml + ${YQ_MERGE} overrides/${APPTV}.yml ${BASE} > Clustering_${APPTV}.yml + ${YQ_MERGE} overrides/${APPTO}.yml ${BASE} > Clustering_${APPTO}.yml + ${YQ_MERGE} overrides/${CONDA}.yml ${BASE} > Clustering_${CONDA}.yml + ${YQ_MERGE} overrides/${ENVMD}.yml ${BASE} > 
Clustering_${ENVMD}.yml + ${YQ_MERGE} overrides/${APPTR}.yml ${BASE_SHORT} > Clustering_${APPTR}_short.yml + ${YQ_MERGE} overrides/${APPTV}.yml ${BASE_SHORT} > Clustering_${APPTV}_short.yml + ${YQ_MERGE} overrides/${APPTO}.yml ${BASE_SHORT} > Clustering_${APPTO}_short.yml + ${YQ_MERGE} overrides/${CONDA}.yml ${BASE_SHORT} > Clustering_${CONDA}_short.yml + ${YQ_MERGE} overrides/${ENVMD}.yml ${BASE_SHORT} > Clustering_${ENVMD}_short.yml + + +clean: + rm Clustering_*.yml + prepare_apptainer_env: cd envs && ./build_singularity.sh prepare_envmodules_env: @@ -13,15 +44,15 @@ prepare_envmodules_env: # short versions, to debug runs & environments run_with_apptainer_backend_registry_short: - ${OB_CMD} -b Clustering_apptainer_registry_smoketest.yml + ${OB_CMD} -b Clustering_registry_short.yml run_with_apptainer_backend_short: - ${OB_CMD} -b Clustering_apptainer_vanilla_smoketest.yml + ${OB_CMD} -b Clustering_apptainer_short.yml mv out out_apptainer_short run_with_conda_backend_short: - ${OB_CMD} -b Clustering_conda_smoketest.yml + ${OB_CMD} -b Clustering_conda_short.yml mv out out_conda_short run_with_envmodules_backend_short: - ${OB_CMD} -b Clustering_envmodules_smoketest.yml + ${OB_CMD} -b Clustering_envmodules_short.yml mv out out_lmod_short # full versions (expect hours) diff --git a/README.md b/README.md index 89d7c05..ad7de1b 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,11 @@ A yaml such as [0a88c91](https://github.com/omnibenchmark/clustering_example/blo In `envs`: conda, apptainer, easybuild (lmod modules) +# Omnibenchmark YAML generation + +The current repo has base templates for different runs. +Install [yq](https://github.com/mikefarah/yq) and run `make generate` if you want to modify the base template in your tests. + # Warnings Mind we try to run clusterings specifying the true number of clusters +- 2. But sometimes the true number is k=3. 
Then we do `k=2, k=2, k=3, k=5, k=6` filling with k=2s as needed, and recomputing the same values multiple times (so runtimes are comparable across datasets, regardless of their true number of clusters). diff --git a/Clustering_apptainer_registry.yml b/base.yml similarity index 96% rename from Clustering_apptainer_registry.yml rename to base.yml index 7e090e3..bfadca6 100644 --- a/Clustering_apptainer_registry.yml +++ b/base.yml @@ -1,31 +1,25 @@ -id: clustering_example_oras - -description: Clustering benchmark on Gagolewski's. Using ORAS registry. version: 1.5 - benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" benchmark_yaml_spec: 0.4 -software_backend: apptainer - software_environments: clustbench: - description: "clustbench on py3.12.3, default python" - envmodule: na - conda: envs/clustbench.yml # not used + description: "clustbench on py3.12.9, optimized python build" + conda: envs/clustbench.yml + envmodule: clustbench/0.1.0-foss-2023b apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 fcps: description: "CRAN's FCPS" - conda: envs/fcps.yml # not used - envmodule: na + conda: envs/fcps.yml + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 rmarkdown: description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml # not used - envmodule: na + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 @@ -223,3 +217,4 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" + diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index 430ed30..099c4c1 100755 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -7,5 +7,5 @@ CMD=singularity BUILD='build --fakeroot' # enable this if you want to compare with the custom python compilation # $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def -$CMD ${BUILD} 
clustbench-vanilla.sif clustbench_apptainer_vanillapy.def +$CMD ${BUILD} clustbench.sif clustbench_apptainer_vanillapy.def $CMD ${BUILD} fcps.sif fcps.def diff --git a/overrides/apptainer.yml b/overrides/apptainer.yml new file mode 100644 index 0000000..93b6c3e --- /dev/null +++ b/overrides/apptainer.yml @@ -0,0 +1,4 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +id: clustering_benchmark_apptainer_oras +description: Clustering benchmark on Gagolewski's. Using apptainer from omnibenchmark ORAS registry. +software_backend: apptainer diff --git a/overrides/apptainer_optimized.yml b/overrides/apptainer_optimized.yml new file mode 100644 index 0000000..ae4b5ad --- /dev/null +++ b/overrides/apptainer_optimized.yml @@ -0,0 +1,12 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +id: clustering_benchmark_apptainer_optimized_local +description: Clustering benchmark on Gagolewski's. Using apptainer (locally built image, optimized python) +software_backend: apptainer + +software_environments: + clustbench: + apptainer: envs/clustbench-optimized.sif + fcps: + apptainer: envs/fcps.sif + rmarkdown: + apptainer: envs/fcps.sif diff --git a/overrides/apptainer_vanilla.yml b/overrides/apptainer_vanilla.yml new file mode 100644 index 0000000..f0d3bc4 --- /dev/null +++ b/overrides/apptainer_vanilla.yml @@ -0,0 +1,12 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +id: clustering_benchmark_apptainer_vanilla_local +description: Clustering benchmark on Gagolewski's. 
Using apptainer (locally built image) +software_backend: apptainer + +software_environments: + clustbench: + apptainer: envs/clustbench.sif + fcps: + apptainer: envs/fcps.sif + rmarkdown: + apptainer: envs/fcps.sif diff --git a/overrides/conda.yml b/overrides/conda.yml new file mode 100644 index 0000000..5f4a1ac --- /dev/null +++ b/overrides/conda.yml @@ -0,0 +1,4 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +id: clustering_benchmark_conda +description: Clustering benchmark on Gagolewski's. Using conda. +software_backend: conda diff --git a/overrides/envmodules.yml b/overrides/envmodules.yml new file mode 100644 index 0000000..a34d58e --- /dev/null +++ b/overrides/envmodules.yml @@ -0,0 +1,4 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +id: clustering_benchmark_envmodules +description: Clustering benchmark on Gagolewski's. Using envmodules. +software_backend: envmodules diff --git a/Clustering_conda_smoketest.yml b/smoketest/base.yml similarity index 92% rename from Clustering_conda_smoketest.yml rename to smoketest/base.yml index 9f66440..db885fe 100644 --- a/Clustering_conda_smoketest.yml +++ b/smoketest/base.yml @@ -1,30 +1,27 @@ -id: clustering_example_envmodules -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
-version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" benchmark_yaml_spec: 0.5 -software_backend: conda - software_environments: clustbench: - description: "clustbench on py3.12.6" + description: "clustbench on py3.12.3, default python" envmodule: clustbench/0.1.0-foss-2023b conda: envs/clustbench.yml - apptainer: na + apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 fcps: description: "CRAN's FCPS" envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 conda: envs/fcps.yml - apptainer: na + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 rmarkdown: description: "R with some plotting dependencies" - envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 conda: envs/rmarkdown.yml - apptainer: na + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + metric_collectors: - id: plotting From 3a178650d838571af50fc8add21a6b4e4f53ff6e Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 19 May 2025 13:34:17 +0200 Subject: [PATCH 52/60] mv folders to timestamped names --- Makefile | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/Makefile b/Makefile index 16b144a..f622128 100644 --- a/Makefile +++ b/Makefile @@ -19,6 +19,7 @@ deps: go install github.com/mikefarah/yq/v4@latest # Generate all the yaml files from base + overrides +.SILENT: generate generate: ${YQ_MERGE} overrides/${APPTR}.yml ${BASE} > Clustering_${APPTR}.yml ${YQ_MERGE} overrides/${APPTV}.yml ${BASE} > Clustering_${APPTV}.yml @@ -30,7 +31,9 @@ generate: ${YQ_MERGE} overrides/${APPTO}.yml ${BASE_SHORT} > Clustering_${APPTO}_short.yml ${YQ_MERGE} overrides/${CONDA}.yml ${BASE_SHORT} > Clustering_${CONDA}_short.yml ${YQ_MERGE} overrides/${ENVMD}.yml ${BASE_SHORT} > Clustering_${ENVMD}_short.yml - + echo "[+] The following files have been generated:" + ls Clustering_*.yml + echo "[+] You can use 'make clean' to delete them" clean: rm 
Clustering_*.yml @@ -43,31 +46,29 @@ prepare_envmodules_env: cd envs && eb rmarkdown.eb --robot # short versions, to debug runs & environments -run_with_apptainer_backend_registry_short: - ${OB_CMD} -b Clustering_registry_short.yml run_with_apptainer_backend_short: - ${OB_CMD} -b Clustering_apptainer_short.yml - mv out out_apptainer_short + ${OB_CMD} -b Clustering_${APPTR}_short.yml + mv out out_${APPTR}_short-$(shell date +'%Y%m%d%H%M') +run_with_apptainer_backend_vanilla_short: + ${OB_CMD} -b Clustering_${APPTV}_short.yml + mv out out_${APPTV}_short-$(shell date +'%Y%m%d%H%M') run_with_conda_backend_short: - ${OB_CMD} -b Clustering_conda_short.yml - mv out out_conda_short + ${OB_CMD} -b Clustering_${CONDA}_short.yml + mv out out_${CONDA}_short-$(shell date +'%Y%m%d%H%M') run_with_envmodules_backend_short: - ${OB_CMD} -b Clustering_envmodules_short.yml - mv out out_lmod_short + ${OB_CMD} -b Clustering_${ENVMD}.yml + mv out out_${ENVMD}_short-$(shell date +'%Y%m%d%H%M') # full versions (expect hours) -run_with_apptainer_backend_registry: - ${OB_CMD} -b Clustering_apptainer_registry.yml - mv out out_apptainer_registry +run_with_apptainer_backend: + ${OB_CMD} -b Clustering_${APPTR}.yml + mv out out_${APPTR}-$(shell date +'%Y%m%d%H%M') run_with_apptainer_backend_vanilla: - ${OB_CMD} -b Clustering_apptainer_vanilla.yml - mv out out_apptainer_vanilla -run_with_apptainer_backend_optimized: - ${OB_CMD} -b Clustering_apptainer_optimized.yml - mv out out_apptainer_vanilla + ${OB_CMD} -b Clustering_${APPTV}.yml + mv out out_${APPTV}-$(shell date +'%Y%m%d%H%M') run_with_conda_backend: - ${OB_CMD} -b Clustering_conda.yml - mv out out_conda + ${OB_CMD} -b Clustering_${CONDA}.yml + mv out out_${CONDA}-$(shell date +'%Y%m%d%H%M') run_with_envmodules_backend: - ${OB_CMD} -b Clustering_envmodules.yml - mv out out_lmod + ${OB_CMD} -b Clustering_${ENVMD}.yml + mv out out_${ENVMD}-$(shell date +'%Y%m%d%H%M') From f6caebd9d84d8dbe92e792d60cfe4c5bf0f83849 Mon Sep 17 00:00:00 2001 From: 
ben Date: Mon, 19 May 2025 13:51:14 +0200 Subject: [PATCH 53/60] add --yes flag --- .github/workflows/benchmark.yml | 2 +- Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index b6cb977..7e2a6fc 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -61,7 +61,7 @@ jobs: shell: bash -l {0} continue-on-error: false run: | - echo "y" | ob run benchmark -b Clustering_conda_smoketest.yml --local --cores 3 --continue-on-error + ob run benchmark -b Clustering_conda_short.yml --local --cores 3 --continue-on-error --yes upload-artifact: name: Benchmark Artifact diff --git a/Makefile b/Makefile index f622128..895285a 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ TIMEOUT ?= 4h YQ_MERGE=yq eval-all 'select(fileIndex==1) * select(fileIndex==0)' # by default, we want to run all snakemake rules even if there are failures (-k) -OB_CMD=ob run benchmark -k --local --task-timeout ${TIMEOUT} --cores ${MAX_CORES} +OB_CMD=ob run benchmark -k --local --task-timeout ${TIMEOUT} --cores ${MAX_CORES} --yes APPTR = apptainer APPTV = apptainer_vanilla From f72d0cae0f79adb35930922d809407d243aa947f Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 19 May 2025 14:00:25 +0200 Subject: [PATCH 54/60] update readmes --- README.md | 2 +- envs/README.md | 51 +++++++++++++++++++++++++------------------------- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index ad7de1b..653de7f 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ A clustering example for omnibenchmark 1. Install omnibenchmark using [our tutorial](https://omnibenchmark.org/tutorial/) 2. Clone the benchmark definition / this repository with `git clone https://github.com/omnibenchmark/clustering_example` 3. Move into the cloned folder: `cd clustering_example` -4. Run locally, somewhat in parallel `ob run benchmark -b CLUSTERING.YAML --local --threads 6`. 
Choose `Clustering.yml` specification based on whether running it with conda, easybuild, apptainer, etc. [More details about the available backends](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md). +4. Run locally, somewhat in parallel `ob run benchmark -b CLUSTERING.YAML --local --cores 6`. Choose your `Clustering_*.yml` specification based on whether running it with conda, easybuild, apptainer, etc. [More details about the available backends](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md). # Clustbench attribution diff --git a/envs/README.md b/envs/README.md index bb1f174..3e1f1e3 100644 --- a/envs/README.md +++ b/envs/README.md @@ -1,56 +1,57 @@ We distribute `Clustering.yml` runs with different backends. -- `Clustering_conda.yml`. Conda semi-reproducible (no pinning, using pip) -- `Clustering_apptainer.yml`. Singularity semi-reproducible, local SIF files. -- `Clustering_oras.yml`. Singularity semi-reproducible, prebuilt remote images from an ORAS registry. - `Clustering_envmodules.yml`. Easybuild backend with default optimization. +- `Clustering_apptainer.yml`. Apptainer, pinned, prebuilt remote images from [omnibenchmark's registry](https://quay.io/organization/omnibenchmark). +- `Clustering_apptainer_vanilla.yml`. Singularity, pinnned, from local SIF images. +- `Clustering_conda.yml`. Conda semi-reproducible (no pinning, using pip) -## Conda +## envmodules - reproducible builds with easybuild ### Files -- `clustbench.yml` -- `fcps.yml` -- `rmarkdown.yml` +- `clustbench.eb` +- `fcps.eb` +- `rmarkdown.eb` +- `rmarkdown-python.eb` ### How to build -No need to `ob software conda pin / prepare`; let `ob run benchmark -b Clustering_conda.yml --local` do it. +- `make prepare_envmodules_env` from the root folder. 
+ +## Apptainer, pinned, with registry pull + +No need to prepare/build anything, since it fetches the apptainer images from a remote registry: + +```bash +make run_with_apptainer_backend +``` -## Apptainer semi-reproducible and local +## Apptainer, pinned, local build ### Files The apptainer images are based in ubuntu-noble docker images. -The "optimized" one does a custom python 3.12 compilation; the vanillapy stocks the default py3.12 interpreter from the official ubuntu docker image. +The "optimized" flavor does a custom python 3.12 compilation; the vanillapy ships the default py3.12 interpreter from the official ubuntu docker image. - `clustbench_apptainer_optimized.def` - `clustbench_apptainer_vanillapy.def` - `fcps.def` -### How to build +### How to build the SIF images - `make prepare_apptainer_env` from the root folder. -## Aptainer semi-reproducible with registry pull - -No need to prepare/build anything, since it fetches the apptainer images from a remote registry: - -```bash -ob run benchmark -b Clustering_apptainer_registry.yml --local -``` - -## envmodules - reproducible builds with easybuild +## Conda ### Files -- `clustbench.eb` -- `fcps.eb` -- `rmarkdown.eb` -- `rmarkdown-python.eb` +- `clustbench.yml` +- `fcps.yml` +- `rmarkdown.yml` ### How to build -- `make prepare_envmodules_env` from the root folder. +No need to `ob software conda pin / prepare`. Just use `ob run benchmark -b Clustering_conda.yml --local`.
+ From eef58c4c54de53eeb1e7210800e1bd77359c8e2b Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 19 May 2025 15:50:56 +0200 Subject: [PATCH 55/60] extract modules with yq --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index 895285a..5477572 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ MAX_CORES ?= 10 TIMEOUT ?= 4h YQ_MERGE=yq eval-all 'select(fileIndex==1) * select(fileIndex==0)' +YQ_REPOS=yq '.stages[].modules[] | .id + ": " + .repository.url + "@" + .repository.commit' # by default, we want to run all snakemake rules even if there are failures (-k) OB_CMD=ob run benchmark -k --local --task-timeout ${TIMEOUT} --cores ${MAX_CORES} --yes @@ -72,3 +73,6 @@ run_with_conda_backend: run_with_envmodules_backend: ${OB_CMD} -b Clustering_${ENVMD}.yml mv out out_${ENVMD}-$(shell date +'%Y%m%d%H%M') + +extract_modules: + @${YQ_REPOS} base.yml From f076b837cb76d6ce076e67c5c554aa8418b4c845 Mon Sep 17 00:00:00 2001 From: ben Date: Tue, 20 May 2025 13:51:26 +0200 Subject: [PATCH 56/60] envmodules short --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5477572..71b6860 100644 --- a/Makefile +++ b/Makefile @@ -57,7 +57,7 @@ run_with_conda_backend_short: ${OB_CMD} -b Clustering_${CONDA}_short.yml mv out out_${CONDA}_short-$(shell date +'%Y%m%d%H%M') run_with_envmodules_backend_short: - ${OB_CMD} -b Clustering_${ENVMD}.yml + ${OB_CMD} -b Clustering_${ENVMD}_short.yml mv out out_${ENVMD}_short-$(shell date +'%Y%m%d%H%M') # full versions (expect hours) From 478ef276c61056ecba3d341093d4b1e733010aa6 Mon Sep 17 00:00:00 2001 From: ben Date: Tue, 20 May 2025 13:55:45 +0200 Subject: [PATCH 57/60] bump version in README --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 653de7f..e772f13 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,12 @@ A clustering example for omnibenchmark # How to run -1. 
Install omnibenchmark using [our tutorial](https://omnibenchmark.org/tutorial/) -2. Clone the benchmark definition / this repository with `git clone https://github.com/omnibenchmark/clustering_example` +1. Install omnibenchmark: `pip install omnibenchmark>=0.2.0` +2. Clone the benchmark definition in this repository with `git clone https://github.com/omnibenchmark/clustering_example` 3. Move into the cloned folder: `cd clustering_example` -4. Run locally, somewhat in parallel `ob run benchmark -b CLUSTERING.YAML --local --cores 6`. Choose your `Clustering_*.yml` specification based on whether running it with conda, easybuild, apptainer, etc. [More details about the available backends](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md). +4. Run locally, with the desired degree of parallelism: + `ob run benchmark -b --local --cores 6`. + Choose your `Clustering_*.yml` specification based on the backend you want to run (conda, easybuild or apptainer). [More details about the available backends and how to build or enable them](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md). 
# Clustbench attribution From 0896ffab1af42bce0b923b95f3ab4bb15db43f3d Mon Sep 17 00:00:00 2001 From: ben Date: Tue, 20 May 2025 16:21:15 +0200 Subject: [PATCH 58/60] track main branch --- .github/workflows/benchmark.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 7e2a6fc..cb744c4 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -48,14 +48,14 @@ jobs: shell: bash -l {0} run: | mamba install -y pip - pip install git+https://github.com/omnibenchmark/omnibenchmark.git@dev + pip install git+https://github.com/omnibenchmark/omnibenchmark.git@main - name: Load benchmark cache id: cache-benchmark uses: actions/cache@v3 with: path: out/ - key: benchmark-${{ runner.os }}-${{ hashFiles('Clustering.yaml') }} + key: benchmark-${{ runner.os }}-${{ hashFiles('Clustering_conda_short.yml') }} - name: Run benchmark shell: bash -l {0} From 1c6179fb6c26c7864435e53ed5f8ccd507a1bd42 Mon Sep 17 00:00:00 2001 From: ben Date: Tue, 20 May 2025 16:27:36 +0200 Subject: [PATCH 59/60] blah blah --- README.md | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index e772f13..7859113 100644 --- a/README.md +++ b/README.md @@ -9,24 +9,23 @@ A clustering example for omnibenchmark `ob run benchmark -b --local --cores 6`. Choose your `Clustering_*.yml` specification based on the backend you want to run (conda, easybuild or apptainer). [More details about the available backends and how to build or enable them](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md). -# Clustbench attribution - -by Marek Gagolewski, modified by Izaskun Mallona - -# Data disclaimer - -Some datasets are commented out to speed up calculations. 
+# Software backends and variants -From [Are cluster validity measures (in) valid?](https://www.sciencedirect.com/science/article/pii/S0020025521010082): +* All needed recipes can be found under `envs`: conda, apptainer, easybuild (lmod modules) +* The `_smoketest` variants are meant for [quick testing](https://en.wikipedia.org/wiki/Smoke_testing_(software)) +* The default `apptainer` container fetches images from an online registry. +* `apptainer-vanilla` makes reference to a container image with stock python (`3.12`) +* `apptainer-optimized` makes reference to a container image with a custom compiled python (`3.12.9`), just to check if optimization flags have a noticeable effect. +* `envmodules` will need you to previously build the `.eb` easyconfigs with easybuild. We plan to make these modules publicly available in the future. +* `conda` environments will fetch software from the configured conda channels and pypi. Does not compile anything, fetches pre-built binaries (assuming there's a build in those channels for your architecture, that is) -> The original benchmark battery consists of 79 data instances, however 16 datasets are accompanied by labels that yield ; they were omitted for their computation would be too lengthy (namely: mnist/digits, mnist/fashion, other/chameleon_t7_10k, other/chameleon_t8_8k, sipu/a1, sipu/a2, sipu/a3, sipu/birch1, sipu/birch2, sipu/d31, sipu/s1, sipu/s2, sipu/s3, sipu/s4, sipu/worms_2, sipu/worms_64). Also uci/glass has been removed as one of its 25-near-neighbour graph’s connected components was too small for the NN-based methods to succeed. This leaves us with 62 datasets in total, see Table 1. +[More info in the envs/ folder](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md). -A yaml such as [0a88c91](https://github.com/omnibenchmark/clustering_example/blob/0a88c910bbda62d1b593f4215a682770227f39ff/Clustering.yaml) with 30 cores should run half of the stuff in ~4 h and reach 97% completion in ~8h. 
# Summary - Data. Example datasets (not a comprehensive list, it's >79 of them): - - https://github.com/imallona/clustbench_data + - https://github.com/imallona/clustbench_data - args: ["--dataset_generator", "mnist", "--dataset_name", "fashion"] - args: ["--dataset_generator", "other", "--dataset_name", "iris"] - args: ["--dataset_generator", "mnist", "--dataset_name", "digits"] @@ -39,7 +38,7 @@ A yaml such as [0a88c91](https://github.com/omnibenchmark/clustering_example/blo - args: ["--linkage", "weighted"] - args: ["--linkage", "median"] - args: ["--linkage", "centroid"] - - https://github.com/imallona/clustbench_sklearn + - https://github.com/imallona/clustbench_sklearn - args: ["--method", "birch"] - args: ["--method", "kmeans"] - args: ["--method", "spectral"] ## too slow @@ -86,11 +85,7 @@ A yaml such as [0a88c91](https://github.com/omnibenchmark/clustering_example/blo - https://github.com/omnibenchmark-example/ward.git - https://github.com/omnibenchmark-example/ari.git - https://github.com/omnibenchmark-example/accuracy.git - - -# Software backends -In `envs`: conda, apptainer, easybuild (lmod modules) # Omnibenchmark YAML generation @@ -102,3 +97,17 @@ Install [yq](https://github.com/mikefarah/yq) and run `make generate` if you wan Mind we try to run clusterings specifying the true number of clusters +- 2. But sometimes the true number is k=3. Then we do `k=2, k=2, k=3, k=5, k=6` filling with k=2s as needed, and recomputing the same values multiple times (so runtimes are comparable across datasets, regardless of their true number of clusters). Also, we have modules by Daniel not fully incorporated into Gagolewski's flow. + +# Data disclaimer + +Some datasets are commented out to speed up calculations. 
+ +From [Are cluster validity measures (in) valid?](https://www.sciencedirect.com/science/article/pii/S0020025521010082): + +> The original benchmark battery consists of 79 data instances, however 16 datasets are accompanied by labels that yield ; they were omitted for their computation would be too lengthy (namely: mnist/digits, mnist/fashion, other/chameleon_t7_10k, other/chameleon_t8_8k, sipu/a1, sipu/a2, sipu/a3, sipu/birch1, sipu/birch2, sipu/d31, sipu/s1, sipu/s2, sipu/s3, sipu/s4, sipu/worms_2, sipu/worms_64). Also uci/glass has been removed as one of its 25-near-neighbour graph’s connected components was too small for the NN-based methods to succeed. This leaves us with 62 datasets in total, see Table 1. + +A yaml such as [0a88c91](https://github.com/omnibenchmark/clustering_example/blob/0a88c910bbda62d1b593f4215a682770227f39ff/Clustering.yaml) with 30 cores should run half of the stuff in ~4 h and reach 97% completion in ~8h. + +# Clustbench attribution + +by Marek Gagolewski, modified by Izaskun Mallona From b64949e7a36f00d9430ef3ddd526dec08681fc49 Mon Sep 17 00:00:00 2001 From: ben Date: Tue, 20 May 2025 16:31:53 +0200 Subject: [PATCH 60/60] add comment --- envs/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/envs/README.md b/envs/README.md index 3e1f1e3..335a9d7 100644 --- a/envs/README.md +++ b/envs/README.md @@ -3,8 +3,12 @@ We distribute `Clustering.yml` runs with different backends. - `Clustering_envmodules.yml`. Easybuild backend with default optimization. - `Clustering_apptainer.yml`. Apptainer, pinned, prebuilt remote images from [omnibenchmark's registry](https://quay.io/organization/omnibenchmark). - `Clustering_apptainer_vanilla.yml`. Singularity, pinnned, from local SIF images. +- `Clustering_apptainer_optimized.yml`. Singularity, pinned, from local SIF images. This image compiles a custom python with optimization flags. - `Clustering_conda.yml`.
Conda semi-reproducible (no pinning, using pip) +The `_short` variants are meant to run smoketests and check that there are no operational problems (abnormal terminations, etc.) when running the environments. + + ## envmodules - reproducible builds with easybuild ### Files