diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml deleted file mode 100644 index 0fbd636..0000000 --- a/.github/workflows/benchmark.yml +++ /dev/null @@ -1,110 +0,0 @@ -name: Run Benchmark -concurrency: - group: ${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -on: - pull_request: - branches: - - 'main' - workflow_dispatch: - -permissions: - id-token: write - pages: write - -jobs: - run-benchmark: - name: Run Benchmark - runs-on: ubuntu-latest - ## runs-on: self-hosted - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Install Mambaforge - uses: conda-incubator/setup-miniconda@v3 - with: - miniforge-variant: Miniforge3 - use-mamba: true - activate-environment: omnibenchmark-env - python-version: "3.12" - auto-update-conda: true - channels: conda-forge - - - name: Cache environment - id: cache-env - uses: actions/cache@v3 - with: - path: | - ~/.conda/pkgs - ~/.conda/envs/omnibenchmark-env - ~/.cache/pip - key: ${{ runner.os }}-conda-pip-${{ hashFiles('requirements.txt') }} - restore-keys: | - ${{ runner.os }}-conda-pip- - - - name: Install omnibenchmark CLI - shell: bash -l {0} - run: | - mamba install -y pip - pip install git+https://github.com/omnibenchmark/omnibenchmark.git@reduce_install_scope - - - name: Load benchmark cache - id: cache-benchmark - uses: actions/cache@v3 - with: - path: out/ - key: benchmark-${{ runner.os }}-${{ hashFiles('Clustering.yaml') }} - - - name: Run benchmark - shell: bash -l {0} - continue-on-error: true - run: | - echo "y" | ob run benchmark -b Clustering.yaml --local --cores 3 --continue-on-error - - upload-artifact: - name: Benchmark Artifact - runs-on: ubuntu-latest - ## runs-on: self-hosted - needs: run-benchmark - if: always() - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Load cached output - uses: actions/cache@v3 - with: - path: out/ - key: benchmark-${{ runner.os }}-${{ hashFiles('Clustering.yaml') }} - - - name: Prepare output - run: | - zip -r benchmark_output.zip out/ - mkdir -p gh-pages - cp out/plotting/plotting_report.html gh-pages/index.html - - - name: Upload zipped output - uses: actions/upload-artifact@v4 - with: - name: benchmark-output - path: benchmark_output.zip - retention-days: 7 - - - name: Upload Pages Artifact - uses: actions/upload-pages-artifact@v3 - with: - path: gh-pages - - - name: Deploy to GitHub Pages - uses: actions/deploy-pages@v4 - - - name: Create Job Summary - if: always() - run: | - echo "### Reports" >> $GITHUB_STEP_SUMMARY - echo "- [Plotting Report](https://${{ github.repository_owner }}.github.io/${{ github.event.repository.name }})" >> $GITHUB_STEP_SUMMARY - echo "### All Outputs" >> $GITHUB_STEP_SUMMARY - echo "- [Complete Benchmark Output](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}#artifacts)" >> $GITHUB_STEP_SUMMARY - diff --git a/.github/workflows/micromamba.yml b/.github/workflows/micromamba.yml.old similarity index 89% rename from .github/workflows/micromamba.yml rename to .github/workflows/micromamba.yml.old index 3c3259b..f4e7801 100644 --- a/.github/workflows/micromamba.yml +++ b/.github/workflows/micromamba.yml.old @@ -4,10 +4,9 @@ name: micromamba # cancel-in-progress: true on: - pull_request: + # pull_request: workflow_dispatch: schedule: - # - cron: "*/30 * * * *" # Runs every 30 minutes for testing - cron: "30 1 * * *" # at 1.30am ## these permissions are only for deployment to gh pages # permissions: @@ -22,7 +21,7 @@ 
 jobs:
     strategy:
       matrix:
         ob_branch: [dev, reduce_install_scope, main]
-        micromamba-version: ['2.1.1-0', '2.0.5-0', '1.5.12-0', '1.5.8-0']
+        micromamba-version: ['2.1.1-0', '1.5.12-0']
       fail-fast: false
     concurrency:
       group: micromamba-${{ matrix.micromamba-version }}-${{ matrix.ob_branch }}
@@ -61,7 +60,7 @@ jobs:
         shell: bash -l {0}
         run: |
           env
-          output=$( echo "y" | ob run benchmark -b Clustering.yaml --local --cores 10 2>&1 )
+          output=$( echo "y" | ob run benchmark -b Clustering_conda.yml --local --cores 10 2>&1 )
           status=$?
           if echo "$output" | grep -i 'Benchmark run has finished successfully'; then
             status=0
diff --git a/.github/workflows/miniconda_miniforge.yml b/.github/workflows/miniconda_miniforge.yml.old
similarity index 92%
rename from .github/workflows/miniconda_miniforge.yml
rename to .github/workflows/miniconda_miniforge.yml.old
index 2cfb10c..069e931 100644
--- a/.github/workflows/miniconda_miniforge.yml
+++ b/.github/workflows/miniconda_miniforge.yml.old
@@ -4,10 +4,9 @@ name: clustbench_miniforge
 #   cancel-in-progress: true

 on:
-  pull_request:
+  # pull_request:
   workflow_dispatch:
   schedule:
-    # - cron: "*/30 * * * *" # Runs every 30 minutes for testing
     - cron: "30 1 * * *" # at 1.30am

 ## these permissions are only for deployment to gh pages
@@ -70,7 +69,7 @@ jobs:
         shell: bash -l {0}
         run: |
           env
-          output=$( echo "y" | ob run benchmark -b Clustering.yaml --local --cores 10 2>&1 )
+          output=$( echo "y" | ob run benchmark -b Clustering_conda.yml --local --cores 10 2>&1 )
           status=$?
           if echo "$output" | grep -i 'Benchmark run has finished successfully'; then
             status=0
diff --git a/.github/workflows/oras.yml.old b/.github/workflows/oras.yml.old
new file mode 100644
index 0000000..697446d
--- /dev/null
+++ b/.github/workflows/oras.yml.old
@@ -0,0 +1,60 @@
+name: oras
+# concurrency:
+#   group: ${{ github.head_ref || github.run_id }}
+#   cancel-in-progress: true
+
+on:
+  # pull_request:
+  workflow_dispatch:
+  schedule:
+    - cron: "30 1 * * *" # at 1.30am
+## these permissions are only for deployment to gh pages
+# permissions:
+#   id-token: write
+#   pages: write
+
+jobs:
+  run-benchmark-oras:
+    name: run_clustbench_oras
+    runs-on: ubuntu-latest
+    # runs-on: self-hosted
+    strategy:
+      matrix:
+        ob_branch: [main]
+        micromamba-version: ['2.1.1-0']
+      fail-fast: false
+    concurrency:
+      group: oras-${{ matrix.micromamba-version }}-${{ matrix.ob_branch }}
+      cancel-in-progress: false # true
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - uses: eWaterCycle/setup-apptainer@v2
+        with:
+          apptainer-version: 1.4.0
+
+      - name: Install ob (with) micromamba
+        uses: mamba-org/setup-micromamba@v2
+        with:
+          cache-environment: false # true
+          micromamba-version: ${{ matrix.micromamba-version }}
+          download-micromamba: true
+          micromamba-binary-path: ${{ runner.temp }}/bin/micromamba-${{ matrix.micromamba-version }}/micromamba
+          environment-name: test-env-${{matrix.ob_branch }}-${{ matrix.micromamba-version }}
+          create-args: >-
+            python=3.12
+            pip
+            conda
+          post-cleanup: environment # all
+
+      - name: Overwrite omnibenchmark CLI to branch
+        shell: bash -l {0}
+        run: |
+          micromamba --version
+          pip install git+https://github.com/omnibenchmark/omnibenchmark.git@${{ matrix.ob_branch }}
+
+      - name: Run benchmark
+        shell: bash -l {0}
+        run: |
+          ob run benchmark -b Clustering_oras.yml --local --cores 10 --yes
diff --git a/.github/workflows/run_conda.yml b/.github/workflows/run_conda.yml
new file mode 100644
index 0000000..01fc104
--- /dev/null
+++ b/.github/workflows/run_conda.yml
@@ -0,0 +1,22 @@
+name: run_on_conda
+on:
+  push:
+  workflow_dispatch:
+  # schedule:
+  #   - cron: "30 1 * * *" # at 1.30am
+
+jobs:
+  run_benchmark_main:
+    name: Run on conda
+    runs-on: ubuntu-latest
+    # runs-on: self-hosted
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Use action
+        uses: omnibenchmark/run_omnibenchmark@main
+        with:
+          yaml: Clustering_conda.yml
+          backend: conda
+          omnibenchmark_branch: 'dev'
diff --git a/.github/workflows/run_oras.yml b/.github/workflows/run_oras.yml
new file mode 100644
index 0000000..4d0dbbc
--- /dev/null
+++ b/.github/workflows/run_oras.yml
@@ -0,0 +1,21 @@
+name: run_on_oras
+on:
+  push:
+  workflow_dispatch:
+  # schedule:
+  #   - cron: "30 1 * * *" # at 1.30am
+
+jobs:
+  run-benchmark:
+    name: Run on oras
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Use action
+        uses: omnibenchmark/run_omnibenchmark@main
+        with:
+          yaml: Clustering_oras.yml
+          backend: apptainer
+          omnibenchmark_branch: 'dev'
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4d38534
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+# image build artifacts
+envs/*.sif
+
+# snakemake
+snakemake.log
+.snakemake/
+
+# vim swaps
+*.swp
+*.swo
diff --git a/Clustering.yaml b/Clustering.yaml
deleted file mode 100644
index 0007ea5..0000000
--- a/Clustering.yaml
+++ /dev/null
@@ -1,232 +0,0 @@
-id: clustering_example
-description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
-version: 1.2
-benchmarker: "Izaskun Mallona, Daniel Incicau"
-storage: https://play.min.io
-benchmark_yaml_spec: 0.04
-storage_api: S3
-storage_bucket_name: clustering_example
-software_backend: conda
-software_environments:
-  clustbench:
-    description: "clustbench on py3.12.6"
-    conda: envs/clustbench.yml
-    envmodule: clustbench
-    apptainer: envs/clustbench.sif
-  sklearn:
-    description: "Daniel's on py3.12.6"
-    conda: envs/sklearn.yml
-    apptainer: envs/sklearn.sif
-    envmodule: clustbench # not true, but
-  R:
-    description: "Daniel's R with readr, dplyr, mclust, caret"
-    conda: envs/r.yml
-    apptainer: envs/r.sif
-    envmodule: fcps # not true, but
-  rmarkdown:
-    description: "R with some plotting dependencies"
-    conda: envs/rmarkdown.yml
-    apptainer: envs/r.sif # not true, but
-    envmodule: fcps # not true, but
-  fcps:
-    description: "CRAN's FCPS"
-    conda: envs/fcps.yml
-    apptainer: envs/fcps.sif
-    envmodule: fcps
-metric_collectors:
-  - id: plotting
-    name: "Single-backend metric collector."
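The two new workflows above (run_conda.yml and run_oras.yml) delegate the whole run to the omnibenchmark/run_omnibenchmark composite action instead of installing micromamba and the CLI by hand. For local debugging, roughly the same run can be reproduced with the CLI directly; a minimal sketch, assuming a working conda installation and Python 3.12, built only from commands already present in these workflows (the backend is taken from the software_backend field of the benchmark YAML):

    # install the omnibenchmark CLI from the branch the workflows pin ('dev')
    pip install git+https://github.com/omnibenchmark/omnibenchmark.git@dev

    # run the conda-backed benchmark locally, as run_conda.yml does through the action
    ob run benchmark -b Clustering_conda.yml --local --cores 10 --yes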
- software_environment: "rmarkdown" - repository: - url: https://github.com/imallona/clustering_report - commit: 1d6bdf5 - inputs: - - metrics.scores - outputs: - - id: plotting.html - path: "{input}/{name}/plotting_report.html" - -stages: - ## clustbench data ########################################################## - - - id: data - modules: - - id: clustbench - name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_data - commit: 366c5a2 - parameters: # comments depict the possible cardinalities and the number of curated labelsets - - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 - # - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 - # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", 
"ecoli"] # 8 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 - # - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 - outputs: - - id: data.matrix - path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" - - id: data.true_labels - path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - - ## clustbench methods (fastcluster) ################################################################### - - - id: clustering - modules: - - id: fastcluster - name: "fastcluster algorithm" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_fastcluster - # url: /home/imallona/src/clustbench_fastcluster/ - commit: "45e43d3" - parameters: - - values: ["--linkage", "complete"] - - values: ["--linkage", "ward"] - # - values: ["--linkage", "average"] - # - values: ["--linkage", "weighted"] - # - values: ["--linkage", "median"] - # - values: ["--linkage", "centroid"] - - id: sklearn - name: "sklearn" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_sklearn - #url: /home/imallona/src/clustbench_sklearn - commit: 5877378 - parameters: - - values: ["--method", "birch"] - - values: ["--method", "kmeans"] - # - values: ["--method", "spectral"] ## too slow - # - values: ["--method", "gm"] - - id: agglomerative - name: "agglomerative" - software_environment: "clustbench" - repository: - url: 
https://github.com/imallona/clustbench_agglomerative - commit: 5454368 - parameters: - # - values: ["--linkage", "average"] - - values: ["--linkage", "complete"] - - values: ["--linkage", "ward"] - - id: genieclust - name: "genieclust" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_genieclust - commit: 6090043 - parameters: - - values: ["--method", "genie", "--gini_threshold", 0.5] - - values: ["--method", "gic"] - # - values: ["--method", "ica"] - - id: fcps - name: "fcps" - software_environment: "fcps" - repository: - url: https://github.com/imallona/clustbench_fcps - commit: 272fa5f - parameters: - # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in conda - - values: ["--method", "FCPS_Minimax"] - - values: ["--method", "FCPS_MinEnergy"] - # - values: ["--method", "FCPS_HDBSCAN_2"] - # - values: ["--method", "FCPS_HDBSCAN_4"] - # - values: ["--method", "FCPS_HDBSCAN_8"] - # - values: ["--method", "FCPS_Diana"] - # - values: ["--method", "FCPS_Fanny"] - # - values: ["--method", "FCPS_Hardcl"] - # - values: ["--method", "FCPS_Softcl"] - # - values: ["--method", "FCPS_Clara"] - # - values: ["--method", "FCPS_PAM"] - inputs: - - entries: - - data.matrix - - data.true_labels - outputs: - - id: clustering.predicted_ks_range - path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" - - - id: metrics - modules: - - id: partition_metrics - name: "clustbench partition metrics" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_metrics - commit: 9132d45 - parameters: - - values: ["--metric", "normalized_clustering_accuracy"] - - values: ["--metric", "adjusted_fm_score"] - # - values: ["--metric", "adjusted_mi_score"] - # - values: ["--metric", "adjusted_rand_score"] - # - values: ["--metric", "fm_score"] - # - values: ["--metric", "mi_score"] - # - values: ["--metric", "normalized_clustering_accuracy"] - # - values: ["--metric", "normalized_mi_score"] - # - values: ["--metric", "normalized_pivoted_accuracy"] - # - values: ["--metric", "pair_sets_index"] - # - values: ["--metric", "rand_score"] - inputs: - - entries: - - clustering.predicted_ks_range - - data.true_labels - outputs: - - id: metrics.scores - path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" diff --git a/Clustering_conda.yml b/Clustering_conda.yml index 7ac1629..f7b4f1e 100644 --- a/Clustering_conda.yml +++ b/Clustering_conda.yml @@ -1,7 +1,7 @@ id: clustering_example_conda description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" storage: http://omnibenchmark.org:9000 benchmark_yaml_spec: 0.04 storage_api: S3 @@ -13,21 +13,6 @@ software_environments: conda: envs/clustbench.yml envmodule: clustbench apptainer: envs/clustbench.sif - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: envs/sklearn.sif - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: envs/r.sif - envmodule: fcps # not true, but - rmarkdown: - description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml - apptainer: envs/r.sif # not true, but - envmodule: fcps # not true, but fcps: description: "CRAN's FCPS" conda: envs/fcps.yml @@ -36,10 +21,10 @@ software_environments: metric_collectors: - id: plotting name: "Single-backend metric collector." - software_environment: "rmarkdown" + software_environment: "fcps" repository: url: https://github.com/imallona/clustering_report - commit: 1d6bdf5 + commit: bbb9d56 inputs: - metrics.scores outputs: @@ -58,67 +43,67 @@ stages: commit: 366c5a2 parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 - - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 - - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 - - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 - - values: ["--dataset_generator", "other", 
"--dataset_name", "chameleon_t4_8k"] # 6 1 - - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 - - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 + # - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 + # - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 + # - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 + # - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 + # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 + # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 + # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 + # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 + # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 + # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 + # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 + # - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 - - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 - - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 + # - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 + # - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 - - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 - - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 - - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 - - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", 
"cross"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 - - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 + # - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 + # - values: 
["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 outputs: - id: data.matrix path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" @@ -138,11 +123,11 @@ stages: commit: "45e43d3" parameters: - values: ["--linkage", "complete"] - - values: ["--linkage", "ward"] - - values: ["--linkage", "average"] - - values: ["--linkage", "weighted"] - - values: ["--linkage", "median"] - - values: ["--linkage", "centroid"] + # - values: ["--linkage", "ward"] + # - values: ["--linkage", "average"] + # - values: ["--linkage", "weighted"] + # - values: ["--linkage", "median"] + # - values: ["--linkage", "centroid"] - id: sklearn name: "sklearn" software_environment: "clustbench" @@ -152,9 +137,9 @@ stages: commit: 5877378 parameters: - values: ["--method", "birch"] - - values: ["--method", "kmeans"] + # - values: ["--method", "kmeans"] # - values: ["--method", "spectral"] ## too slow - - values: ["--method", "gm"] + # - values: ["--method", "gm"] - id: agglomerative name: "agglomerative" software_environment: "clustbench" @@ -163,8 +148,8 @@ stages: commit: 5454368 parameters: - values: ["--linkage", "average"] - - values: ["--linkage", "complete"] - - values: ["--linkage", "ward"] + # - values: ["--linkage", "complete"] + # - values: ["--linkage", "ward"] - id: genieclust name: "genieclust" software_environment: "clustbench" @@ -173,27 +158,27 @@ stages: commit: 6090043 parameters: - values: ["--method", "genie", "--gini_threshold", 0.5] - - values: ["--method", "gic"] - - values: ["--method", "ica"] + # - values: ["--method", "gic"] + # - values: ["--method", "ica"] - id: fcps name: "fcps" software_environment: "fcps" repository: url: https://github.com/imallona/clustbench_fcps - commit: 272fa5f + commit: fc37faa parameters: # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in conda - - values: ["--method", "FCPS_Minimax"] - - values: ["--method", "FCPS_MinEnergy"] - - values: ["--method", "FCPS_HDBSCAN_2"] - - values: ["--method", "FCPS_HDBSCAN_4"] - - values: ["--method", "FCPS_HDBSCAN_8"] - - values: ["--method", "FCPS_Diana"] - - values: ["--method", "FCPS_Fanny"] - - values: ["--method", "FCPS_Hardcl"] - - values: ["--method", "FCPS_Softcl"] - - values: ["--method", "FCPS_Clara"] - - values: ["--method", "FCPS_PAM"] + - values: ["--method", "FCPS_Minimax", "--seed", 2] + - values: ["--method", "FCPS_MinEnergy", "--seed", 2] + - values: ["--method", "FCPS_HDBSCAN_2", "--seed", 2] + - values: ["--method", "FCPS_HDBSCAN_4", "--seed", 2] + - values: ["--method", "FCPS_HDBSCAN_8", "--seed", 2] + - values: ["--method", "FCPS_Diana", "--seed", 2] + - values: ["--method", "FCPS_Fanny", "--seed", 2] + - values: ["--method", "FCPS_Hardcl", "--seed", 2] + - values: ["--method", "FCPS_Softcl", "--seed", 2] + - values: ["--method", "FCPS_Clara", "--seed", 2] + - values: ["--method", "FCPS_PAM", "--seed", 2] inputs: - entries: - data.matrix @@ -217,11 +202,11 @@ stages: - values: ["--metric", "adjusted_rand_score"] - values: ["--metric", "fm_score"] - values: ["--metric", "mi_score"] - - values: ["--metric", "normalized_clustering_accuracy"] - - values: ["--metric", "normalized_mi_score"] - - values: ["--metric", "normalized_pivoted_accuracy"] - - values: ["--metric", "pair_sets_index"] - - values: ["--metric", "rand_score"] + # - values: ["--metric", "normalized_clustering_accuracy"] + # - values: ["--metric", 
"normalized_mi_score"] + # - values: ["--metric", "normalized_pivoted_accuracy"] + # - values: ["--metric", "pair_sets_index"] + # - values: ["--metric", "rand_score"] inputs: - entries: - clustering.predicted_ks_range diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml index 3c2b8bd..a08c933 100644 --- a/Clustering_envmodules.yml +++ b/Clustering_envmodules.yml @@ -1,7 +1,7 @@ id: clustering_example_envmodules description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" storage: http://omnibenchmark.org:9000 benchmark_yaml_spec: 0.04 storage_api: S3 @@ -12,34 +12,22 @@ software_environments: description: "clustbench on py3.12.6" conda: envs/clustbench.yml envmodule: clustbench - apptainer: envs/clustbench.sif - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: envs/sklearn.sif - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: envs/r.sif - envmodule: fcps # not true, but - rmarkdown: - description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml - apptainer: envs/r.sif # not true, but - envmodule: fcps # not true, but + apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/clustbench:latest fcps: - description: "CRAN's FCPS" + description: "CRAN's FCPS on R 4.3.2" conda: envs/fcps.yml - apptainer: envs/fcps.sif + apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/fcps:latest envmodule: fcps + # rmarkdown: + # description: "rmarkdown on R 4.4.2" + # envmodule: rmarkdown metric_collectors: - id: plotting name: "Single-backend metric collector." 
- software_environment: "rmarkdown" + software_environment: "fcps" repository: url: https://github.com/imallona/clustering_report - commit: 1d6bdf5 + commit: bbb9d56 inputs: - metrics.scores outputs: @@ -58,67 +46,67 @@ stages: commit: 366c5a2 parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 - - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 - - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 - - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 - - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 - - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 - - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 + # - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 + # - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 + # - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 + # - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 + # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 + # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 + # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 + # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 + # - values: ["--dataset_generator", "graves", 
"--dataset_name", "zigzag_outliers"] # 3, 5 2 + # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 + # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 + # - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 - - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 - - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 + # - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 + # - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 - - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 - - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 - - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 - - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 - - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 - - 
values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 + # - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 outputs: - id: data.matrix path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" @@ -138,11 +126,11 @@ stages: commit: "45e43d3" parameters: - values: ["--linkage", "complete"] - - values: ["--linkage", "ward"] - - values: ["--linkage", "average"] - - values: ["--linkage", "weighted"] - - values: ["--linkage", "median"] - - values: ["--linkage", "centroid"] + # - values: ["--linkage", "ward"] + # - values: ["--linkage", "average"] + # - values: ["--linkage", "weighted"] + # - values: ["--linkage", "median"] + # - values: ["--linkage", "centroid"] - id: sklearn name: "sklearn" software_environment: "clustbench" @@ -152,9 +140,9 @@ stages: commit: 5877378 parameters: - values: ["--method", "birch"] - - values: ["--method", "kmeans"] + # - 
values: ["--method", "kmeans"] # - values: ["--method", "spectral"] ## too slow - - values: ["--method", "gm"] + # - values: ["--method", "gm"] - id: agglomerative name: "agglomerative" software_environment: "clustbench" @@ -163,8 +151,8 @@ stages: commit: 5454368 parameters: - values: ["--linkage", "average"] - - values: ["--linkage", "complete"] - - values: ["--linkage", "ward"] + # - values: ["--linkage", "complete"] + # - values: ["--linkage", "ward"] - id: genieclust name: "genieclust" software_environment: "clustbench" @@ -173,27 +161,27 @@ stages: commit: 6090043 parameters: - values: ["--method", "genie", "--gini_threshold", 0.5] - - values: ["--method", "gic"] - - values: ["--method", "ica"] + # - values: ["--method", "gic"] + # - values: ["--method", "ica"] - id: fcps name: "fcps" software_environment: "fcps" repository: url: https://github.com/imallona/clustbench_fcps - commit: 272fa5f + commit: fc37faa parameters: # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in conda - - values: ["--method", "FCPS_Minimax"] - - values: ["--method", "FCPS_MinEnergy"] - - values: ["--method", "FCPS_HDBSCAN_2"] - - values: ["--method", "FCPS_HDBSCAN_4"] - - values: ["--method", "FCPS_HDBSCAN_8"] - - values: ["--method", "FCPS_Diana"] - - values: ["--method", "FCPS_Fanny"] - - values: ["--method", "FCPS_Hardcl"] - - values: ["--method", "FCPS_Softcl"] - - values: ["--method", "FCPS_Clara"] - - values: ["--method", "FCPS_PAM"] + - values: ["--method", "FCPS_Minimax", "--seed", 2] + - values: ["--method", "FCPS_MinEnergy", "--seed", 2] + - values: ["--method", "FCPS_HDBSCAN_2", "--seed", 2] + - values: ["--method", "FCPS_HDBSCAN_4", "--seed", 2] + - values: ["--method", "FCPS_HDBSCAN_8", "--seed", 2] + - values: ["--method", "FCPS_Diana", "--seed", 2] + - values: ["--method", "FCPS_Fanny", "--seed", 2] + - values: ["--method", "FCPS_Hardcl", "--seed", 2] + - values: ["--method", "FCPS_Softcl", "--seed", 2] + - values: ["--method", "FCPS_Clara", "--seed", 2] + - values: ["--method", "FCPS_PAM", "--seed", 2] inputs: - entries: - data.matrix @@ -217,11 +205,11 @@ stages: - values: ["--metric", "adjusted_rand_score"] - values: ["--metric", "fm_score"] - values: ["--metric", "mi_score"] - - values: ["--metric", "normalized_clustering_accuracy"] - - values: ["--metric", "normalized_mi_score"] - - values: ["--metric", "normalized_pivoted_accuracy"] - - values: ["--metric", "pair_sets_index"] - - values: ["--metric", "rand_score"] + # - values: ["--metric", "normalized_clustering_accuracy"] + # - values: ["--metric", "normalized_mi_score"] + # - values: ["--metric", "normalized_pivoted_accuracy"] + # - values: ["--metric", "pair_sets_index"] + # - values: ["--metric", "rand_score"] inputs: - entries: - clustering.predicted_ks_range diff --git a/Clustering_oras.yml b/Clustering_oras.yml index 6640461..ebe9e23 100644 --- a/Clustering_oras.yml +++ b/Clustering_oras.yml @@ -1,108 +1,110 @@ -id: clustering_example -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. Caution dirty apptainer sifs. -version: 1.2 -benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: https://play.min.io +id: clustering_example_oras +description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
+version: 1.4 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +storage: http://omnibenchmark.org:9000 benchmark_yaml_spec: 0.04 storage_api: S3 -storage_bucket_name: clustering_example +storage_bucket_name: clusteringexampleoras software_backend: apptainer software_environments: clustbench: description: "clustbench on py3.12.6" conda: envs/clustbench.yml envmodule: clustbench - apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/clustbench:latest - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/r:latest - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/sklearn:latest - envmodule: fcps # not true, but + apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/clustbench:latest fcps: description: "CRAN's FCPS" conda: envs/fcps.yml apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/fcps:latest + ## apptainer: oras://quay.io/omnibenchmark/fcps:0.1.2 envmodule: fcps +metric_collectors: + - id: plotting + name: "Single-backend metric collector." + software_environment: "fcps" + repository: + url: https://github.com/imallona/clustering_report + commit: bbb9d56 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" stages: - ## clustbench data ########################################################## - id: data modules: - id: clustbench - name: "clustbench datasets" + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_data commit: 366c5a2 - parameters: + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 - - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 - - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 + # - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 
- - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 - - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 - - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 - - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 + # - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 + # - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 + # - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 + # - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 + # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 + # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 + # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 + # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 + # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 + # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 + # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 + # - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 - - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 - - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 + # - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 + # - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 - - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 - - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 - - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 - - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 + # - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 + # - values: ["--dataset_generator", 
"sipu", "--dataset_name", "unbalance"] # 8 1 - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 - - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 + # - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 + # - values: 
["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 + # - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 + # - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 outputs: - id: data.matrix path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" @@ -122,11 +124,11 @@ stages: commit: "45e43d3" parameters: - values: ["--linkage", "complete"] - - values: ["--linkage", "ward"] - - values: ["--linkage", "average"] - - values: ["--linkage", "weighted"] - - values: ["--linkage", "median"] - - values: ["--linkage", "centroid"] + # - values: ["--linkage", "ward"] + # - values: ["--linkage", "average"] + # - values: ["--linkage", "weighted"] + # - values: ["--linkage", "median"] + # - values: ["--linkage", "centroid"] - id: sklearn name: "sklearn" software_environment: "clustbench" @@ -136,9 +138,9 @@ stages: commit: 5877378 parameters: - values: ["--method", "birch"] - - values: ["--method", "kmeans"] + # - values: ["--method", "kmeans"] # - values: ["--method", "spectral"] ## too slow - - values: ["--method", "gm"] + # - values: ["--method", "gm"] - id: agglomerative name: "agglomerative" software_environment: "clustbench" @@ -147,8 +149,8 @@ stages: commit: 5454368 parameters: - values: ["--linkage", "average"] - - values: ["--linkage", "complete"] - - values: ["--linkage", "ward"] + # - values: ["--linkage", "complete"] + # - values: ["--linkage", "ward"] - id: genieclust name: "genieclust" software_environment: "clustbench" @@ -157,27 +159,27 @@ stages: commit: 6090043 parameters: - values: ["--method", "genie", "--gini_threshold", 0.5] - - values: ["--method", "gic"] - - values: ["--method", "ica"] + # - values: ["--method", "gic"] + # - values: ["--method", "ica"] - id: fcps name: "fcps" software_environment: "fcps" repository: url: https://github.com/imallona/clustbench_fcps - commit: 272fa5f + commit: fc37faa parameters: # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in conda - - values: ["--method", "FCPS_Minimax"] - - values: ["--method", "FCPS_MinEnergy"] - - values: ["--method", "FCPS_HDBSCAN_2"] - - values: ["--method", "FCPS_HDBSCAN_4"] - - values: ["--method", "FCPS_HDBSCAN_8"] - - values: ["--method", "FCPS_Diana"] - - values: ["--method", "FCPS_Fanny"] - - values: ["--method", "FCPS_Hardcl"] - - values: ["--method", "FCPS_Softcl"] - - values: ["--method", "FCPS_Clara"] - - values: ["--method", "FCPS_PAM"] + - values: ["--method", "FCPS_Minimax", "--seed", 2] + - values: ["--method", "FCPS_MinEnergy", "--seed", 2] + - values: ["--method", "FCPS_HDBSCAN_2", "--seed", 2] + - values: ["--method", "FCPS_HDBSCAN_4", "--seed", 2] + - values: ["--method", "FCPS_HDBSCAN_8", "--seed", 2] + - values: ["--method", "FCPS_Diana", "--seed", 2] + - values: ["--method", "FCPS_Fanny", "--seed", 2] + - values: 
["--method", "FCPS_Hardcl", "--seed", 2] + - values: ["--method", "FCPS_Softcl", "--seed", 2] + - values: ["--method", "FCPS_Clara", "--seed", 2] + - values: ["--method", "FCPS_PAM", "--seed", 2] inputs: - entries: - data.matrix @@ -201,11 +203,11 @@ stages: - values: ["--metric", "adjusted_rand_score"] - values: ["--metric", "fm_score"] - values: ["--metric", "mi_score"] - - values: ["--metric", "normalized_clustering_accuracy"] - - values: ["--metric", "normalized_mi_score"] - - values: ["--metric", "normalized_pivoted_accuracy"] - - values: ["--metric", "pair_sets_index"] - - values: ["--metric", "rand_score"] + # - values: ["--metric", "normalized_clustering_accuracy"] + # - values: ["--metric", "normalized_mi_score"] + # - values: ["--metric", "normalized_pivoted_accuracy"] + # - values: ["--metric", "pair_sets_index"] + # - values: ["--metric", "rand_score"] inputs: - entries: - clustering.predicted_ks_range diff --git a/Clustering_singularity.yml b/Clustering_singularity.yml deleted file mode 100644 index c80b498..0000000 --- a/Clustering_singularity.yml +++ /dev/null @@ -1,317 +0,0 @@ -id: clustering_example_apptainer -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. -version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: http://omnibenchmark.org:9000 -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clusteringexampleapptainer -software_backend: apptainer -software_environments: - clustbench: - description: "clustbench on py3.12.6" - conda: envs/clustbench.yml - envmodule: clustbench - apptainer: envs/clustbench.sif - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: envs/sklearn.sif - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: envs/r.sif - envmodule: fcps # not true, but - rmarkdown: - description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml - apptainer: envs/r.sif # not true, but - envmodule: fcps # not true, but - fcps: - description: "CRAN's FCPS" - conda: envs/fcps.yml - apptainer: envs/fcps.sif - envmodule: fcps -metric_collectors: - - id: plotting - name: "Single-backend metric collector." 
- software_environment: "rmarkdown" - repository: - url: https://github.com/imallona/clustering_report - commit: 1d6bdf5 - inputs: - - metrics.scores - outputs: - - id: plotting.html - path: "{input}/{name}/plotting_report.html" -stages: - ## clustbench data ########################################################## - - - id: data - modules: - - id: clustbench - name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_data - commit: 366c5a2 - parameters: # comments depict the possible cardinalities and the number of curated labelsets - - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 - - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 - - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 - - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 - - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 - - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 - - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 - - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 - - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 - - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 - - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 - - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 - - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 - - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 - - values: ["--dataset_generator", "uci", 
"--dataset_name", "ionosphere"] # 2 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 - - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 - outputs: - - id: data.matrix - path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" - - id: data.true_labels - path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - - ## clustbench methods (fastcluster) ################################################################### - - - id: clustering - modules: - - id: fastcluster - name: "fastcluster algorithm" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_fastcluster - # url: /home/imallona/src/clustbench_fastcluster/ - commit: "45e43d3" - parameters: - - values: ["--linkage", "complete"] - - values: ["--linkage", "ward"] - - values: ["--linkage", "average"] - - values: ["--linkage", "weighted"] - - values: ["--linkage", "median"] - - values: ["--linkage", "centroid"] - - id: sklearn - name: "sklearn" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_sklearn - #url: /home/imallona/src/clustbench_sklearn - commit: 5877378 - parameters: - - values: ["--method", "birch"] - - values: ["--method", "kmeans"] - # - values: ["--method", "spectral"] ## too slow - - values: ["--method", "gm"] - - id: agglomerative - name: "agglomerative" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_agglomerative - commit: 5454368 - parameters: - - values: ["--linkage", "average"] - - values: 
["--linkage", "complete"] - - values: ["--linkage", "ward"] - - id: genieclust - name: "genieclust" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_genieclust - commit: 6090043 - parameters: - - values: ["--method", "genie", "--gini_threshold", 0.5] - - values: ["--method", "gic"] - - values: ["--method", "ica"] - - id: fcps - name: "fcps" - software_environment: "fcps" - repository: - url: https://github.com/imallona/clustbench_fcps - commit: 272fa5f - parameters: - # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in conda - - values: ["--method", "FCPS_Minimax"] - - values: ["--method", "FCPS_MinEnergy"] - - values: ["--method", "FCPS_HDBSCAN_2"] - - values: ["--method", "FCPS_HDBSCAN_4"] - - values: ["--method", "FCPS_HDBSCAN_8"] - - values: ["--method", "FCPS_Diana"] - - values: ["--method", "FCPS_Fanny"] - - values: ["--method", "FCPS_Hardcl"] - - values: ["--method", "FCPS_Softcl"] - - values: ["--method", "FCPS_Clara"] - - values: ["--method", "FCPS_PAM"] - inputs: - - entries: - - data.matrix - - data.true_labels - outputs: - - id: clustering.predicted_ks_range - path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" - - - id: metrics - modules: - - id: partition_metrics - name: "clustbench partition metrics" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_metrics - commit: 9132d45 - parameters: - - values: ["--metric", "normalized_clustering_accuracy"] - - values: ["--metric", "adjusted_fm_score"] - - values: ["--metric", "adjusted_mi_score"] - - values: ["--metric", "adjusted_rand_score"] - - values: ["--metric", "fm_score"] - - values: ["--metric", "mi_score"] - - values: ["--metric", "normalized_clustering_accuracy"] - - values: ["--metric", "normalized_mi_score"] - - values: ["--metric", "normalized_pivoted_accuracy"] - - values: ["--metric", "pair_sets_index"] - - values: ["--metric", "rand_score"] - inputs: - - entries: - - clustering.predicted_ks_range - - data.true_labels - outputs: - - id: metrics.scores - path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - - # ## daniel's data ########################################################################### - - # - id: danielsdata - # modules: - # - id: iris_manual - # name: "Iris Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/iris.git - # commit: 47c63f0 - # - id: penguins - # name: "Penguins Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/penguins.git - # commit: 9032478 - # outputs: - # - id: data.features - # path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv" - # - id: data.labels - # path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv" - - # ## daniel's distances ######################################################################## - - # - id: distances - # modules: - # - id: D1 - # software_environment: "sklearn" - # parameters: - # - values: ["--measure", "cosine"] - # - values: ["--measure", "euclidean"] - # - values: ["--measure", "manhattan"] - # - values: ["--measure", "chebyshev"] - # repository: - # url: https://github.com/omnibenchmark-example/distance.git - # commit: dd99d4f - # inputs: - # - entries: - # - data.features - # outputs: - # - id: distances - # path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv" - - # ## 
daniel's methods ################################################################### - - # - id: danielmethods - # modules: - # - id: kmeans - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/kmeans.git - # commit: 049c8b1 - # - id: ward - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ward.git - # commit: 976e3f3 - # inputs: - # - entries: - # - distances - # outputs: - # - id: methods.clusters - # path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv" - - # ## daniel's metrics ################################################################### - - # - id: danielsmetrics - # modules: - # - id: ari - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ari.git - # commit: 72708f0 - # - id: accuracy - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/accuracy.git - # commit: e26b32f - # inputs: - # - entries: - # - methods.clusters - # - data.labels - # outputs: - # - id: metrics.mapping - # path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt" diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..622a632 --- /dev/null +++ b/Makefile @@ -0,0 +1,33 @@ +MAX_CORES ?= 50 + +# without continue on error (-k) +OB_CMD=ob run benchmark --local --cores ${MAX_CORES} --yes + +# prepare_apptainer_env: +# cd envs && bash build_singularity.sh + +all: run_with_default_conda run_with_unpinned_oras run_with_default_envs # knit_report + +run_with_default_conda: + ${OB_CMD} -b Clustering_conda.yml + cp Clustering_conda.yml out + mv out out_conda_$(shell date +'%Y%m%d_%H%M') + +run_with_unpinned_oras: + ${OB_CMD} -b Clustering_oras.yml + cp Clustering_oras.yml out + mv out out_singularity_$(shell date +'%Y%m%d_%H%M') + +run_with_default_envs: + ${OB_CMD} -b Clustering_envmodules.yml + cp Clustering_envmodules.yml out + mv out out_envmodules_$(shell date +'%Y%m%d_%H%M') + +# ## derived from Mark's plots to process multiple benchmark runs at once +# knit_report: +# ## todo incorporate this report to this repo, downloading from a temporary branch `mark` is a bad idea +# ## also control the environment this is run with +# wget -nc https://raw.githubusercontent.com/imallona/clustering_report/refs/heads/mark/07_metrics_across_backends.Rmd +# R -e 'rmarkdown::render("07_metrics_across_backends.Rmd", params = list(performance_bn = "performance-results.rds", metrics_bn = "metrics-results.rds", clustering_dir = "."))' +# wget -nc https://github.com/imallona/clustering_report/blob/mark/08_performances_across_backends.Rmd +# R -e 'rmarkdown::render("08_performances_across_backends.Rmd", params = list(performance_bn = "performance-results.rds", metrics_bn = "metrics-results.rds", clustering_dir = "."))' diff --git a/envs/README.md b/envs/README.md index 69aa5c1..cb77a28 100644 --- a/envs/README.md +++ b/envs/README.md @@ -1,8 +1,7 @@ We distribute `Clustering.yml` runs with different backends. - `Clustering_conda.yml`. Conda semi-reproducible (no pinning, pip) -- `Clustering_singularity.yml`. Singularity semi-reproducible, local SIF files. -- `Clustering_oras.yml`. Singularity semi-reproducible, prebuilt remote images. +- `Clustering_oras.yml`. Singularity non- to semi-reproducible, prebuilt remote images. - `Clustering_envmodules.yml`. Easybuilt with default optimization. 
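For orientation, each backend spec listed in this hunk is driven by one target of the `Makefile` introduced above; a minimal sketch of running a single backend through `make`, where the `MAX_CORES` override is illustrative and `ob` is assumed to be already on the PATH:

```
# invoke only the conda-backed run defined in the new Makefile,
# overriding the default MAX_CORES (?= 50) with a smaller illustrative value
make run_with_default_conda MAX_CORES=10
```

The target itself wraps `ob run benchmark -b Clustering_conda.yml --local --cores ${MAX_CORES} --yes` and then renames `out/` with a timestamp suffix.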
@@ -12,8 +11,6 @@ We distribute `Clustering.yml` runs with different backends. - `clustbench.yml` - `fcps.yml` -- `r.yml` -- `sklearn.yml` ### How to build @@ -25,22 +22,24 @@ No need to `ob software conda pin / prepare`; let `ob run benchmark -b Clusterin - `clustbench_singularity.def` - `fcps_singularity.def` -- `r_singularity.def` -- `sklearn_singularity.def` ### How to build - `build_singularity.sh` +### How to push to renku's gitlab registry + +``` +apptainer push --docker-username janedoe --docker-password glpat-uzh fcps.sif oras://registry.renkulab.io/izaskun.mallona/clustering_example/name:tag +``` + ## Apptainer semi-reproducible and remote No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://gitlab.renkulab.io/izaskun.mallona/clustering_example/container_registry. ## Apptainer (reproducible) with easybuild -Doing... - -Lorem ipsum. +This is pending work. ## envmodules - reproducible builds with easybuild @@ -49,10 +48,10 @@ Lorem ipsum. - `clustbench.eb` - `fcps.eb` -### How to build +### How to build and warnings 1. Mind https://github.com/easybuilders/easybuild-easyconfigs/commit/e29210626f076e3a207f1abf3759ea124e28f8b2 2. Mind `clustbench` is only installable from https://github.com/gagolews/genieclust/archive/refs/tags/v1.1.6.tar.gz and not from pypi's tgz (!), download it locally and ideally update the easyconfig to automate this 3. `python3-wget` from pypi doesn't look very well maintained -4. `eb fcps.eb --robot` -5. `eb clustbench.eb --robot` +4. `eb fcps.eb --robot --ignore-checksums` +5. `eb clustbench.eb --robot --ignore-checksums` diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index 86e053f..d51532f 100644 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -1,9 +1,5 @@ #!/bin/bash -sudo singularity build sklearn.sif sklearn_singularity.def +singularity build clustbench.sif clustbench_singularity.def -sudo singularity build clustbench.sif clustbench_singularity.def - -sudo singularity build r.sif r_singularity.def - -sudo singularity build fcps.sif fcps_singularity.def +singularity build fcps.sif fcps_singularity.def diff --git a/envs/clustbench.eb b/envs/clustbench.eb index 22597fb..e6c2d1a 100644 --- a/envs/clustbench.eb +++ b/envs/clustbench.eb @@ -3,7 +3,7 @@ easyblock = 'PythonBundle' name = 'clustbench' -version = '1' +version = '1.6' homepage = 'https://python.org/' description = "Bundle of Python packages for ob clustering_example" @@ -29,43 +29,6 @@ exts_default_options = { 'use_pip' : True } -## https://files.pythonhosted.org/packages/27/fe/e78538f4cd7b1b28e9c625eabd21f314004d00644a8347d0b01473e72ffa/clustering_benchmarks-1.1.5.tar.gz -## https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip -## https://files.pythonhosted.org/packages/5d/b8/f143d907d93bd4a3dd51d07c4e79b37bedbfc2177f4949bfa0d6ba0af647/fastcluster-1.2.6.tar.gz -## https://files.pythonhosted.org/packages/68/7c/d465bab9f98b75c5c1f5e80165dd82847a504ced655d162b585df08a717b/genieclust-1.1.6.tar.gz -## https://files.pythonhosted.org/packages/a2/45/eaaacaa4f4f2931a80d40e453df275d9af7c07616c5d753272d3055fb79e/genieclust-1.1.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - -source_urls = [PYPI_SOURCE, - 'https://files.pythonhosted.org/packages/27/fe/e78538f4cd7b1b28e9c625eabd21f314004d00644a8347d0b01473e72ffa/',
'https://files.pythonhosted.org/packages/68/7c/d465bab9f98b75c5c1f5e80165dd82847a504ced655d162b585df08a717b/', - 'https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/', - 'https://files.pythonhosted.org/packages/5d/b8/f143d907d93bd4a3dd51d07c4e79b37bedbfc2177f4949bfa0d6ba0af647/', - 'https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/', - 'https://files.pythonhosted.org/packages/84/4d/b720d6000f4ca77f030bd70f12550820f0766b568e43f11af7f7ad9061aa/', - 'https://files.pythonhosted.org/packages/68/dd/fa2e1a45fce2d09f4aea3cee169760e672c8262325aa5796c49d543dc7e6/', - 'https://files.pythonhosted.org/packages/84/4d/b720d6000f4ca77f030bd70f12550820f0766b568e43f11af7f7ad9061aa', - 'https://files.pythonhosted.org/packages/67/66/91d242ea8dd1729addd36069318ba2cd03874872764f316c3bb51b633ed2/', - 'https://files.pythonhosted.org/packages/e2/a9/a0c57aee75f77794adaf35322f8b6404cbd0f89ad45c87197a937764b7d0/', - 'https://files.pythonhosted.org/packages/d2/c1/72b9622fcb32ff98b054f724e213c7f70d6898baa714f4516288456ceaba/', - 'https://github.com/pybind/pybind11/archive/', - 'https://files.pythonhosted.org/packages/a9/95/a3dbbb5028f35eafb79008e7522a75244477d2838f38cbb722248dabc2a8/', - 'https://files.pythonhosted.org/packages/84/a4/d9da2989a3d937e94616ef07f0630c507f6baa77ad37f94ceb06b36cacc1/python3-wget-0.0.2-beta1.tar.gz', - 'https://files.pythonhosted.org/packages/6a/ef/6e3736663ee67369f7f5b697674bfbd3efc91e7096ddd4452bbbc80065ff/hypothesis-6.124.7.tar.gz', - 'https://files.pythonhosted.org/packages/03/c6/14a17e10813b8db20d1e800ff9a3a898e65d25f2b0e9d6a94616f1e3362c/numpy-1.23.0.tar.gz', - 'https://files.pythonhosted.org/packages/f6/d8/ab692a75f584d13c6542c3994f75def5bce52ded9399f52e230fe402819d/numpy-1.22.4.zip', - 'https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz', - 'https://files.pythonhosted.org/packages/64/33/60135848598c076ce4b231e1b1895170f45fbcaeaa2c9d5e38b04db70c35/joblib-1.4.2.tar.gz', - 'https://files.pythonhosted.org/packages/bd/55/b5148dcbf72f5cde221f8bfe3b6a540da7aa1842f6b491ad979a6c8b84af/threadpoolctl-3.5.0.tar.gz', - 'https://files.pythonhosted.org/packages/9e/a5/4ae3b3a0755f7b35a280ac90b28817d1f380318973cff14075ab41ef50d9/scikit_learn-1.6.1.tar.gz', - 'https://files.pythonhosted.org/packages/ef/e5/c09d20723bfd91315f6f4ddc77912b0dcc09588b4ca7ad2ffa204607ad7f/scikit-learn-1.4.2.tar.gz', - 'https://files.pythonhosted.org/packages/ee/5e/16e17bedcf54d5b618dc0771690deda77178e5c310402881c3d2d6c5f27c/hurry.filesize-0.9.tar.gz'] - - -## caution download genieclust here, not pypi, they differ and pypi's it's not installable! -## cd /home/imallona/.local/easybuild/sources/c/clustbench/extensions/ -## wget wget https://github.com/gagolews/genieclust/archive/refs/tags/v1.1.6.tar.gz -O genieclust-1.1.6.tar.gz -## todo automate this within the easyconfig! 
- exts_list = [ ('natsort', '8.4.0', { 'checksums': ['45312c4a0e5507593da193dedd04abb1469253b601ecaf63445ad80f0a1ea581'], @@ -83,7 +46,8 @@ exts_list = [ 'checksums': ['aab886efa7b6bba7ac124f4498153d053e5a08b822d2254926b7206cdf5a8aa6'], }), ('genieclust', '1.1.6', { - 'checksums': ['fb5b4ff68eef9e73496afa5949e726c8522c72e51f092716a6a598b03d5c09d6'], + 'source_urls': ['https://github.com/gagolews/genieclust/archive/refs/tags/'], + 'sources': ['v1.1.6.tar.gz'], }), ('hurry.filesize', '0.9', { 'checksums': ['f5368329adbef86accd3bc9490522340bb79260455ae89b1a42c10f63801b9a6'], @@ -92,11 +56,11 @@ exts_list = [ 'modulename': 'wget', 'checksums': ['bbe7f44b3c28c4f7126aff20e8a438e78f6e4f1878d8b0c4940e87363813c17d'], }), - ('clustering_benchmarks', '1.1.5', { - 'modulename': 'clustbench', - 'checksums': ['1732c262fb13be2f88814ef9a19c60108e91a7f6cfb9b960a42feaa299034ea3'], - }), -] + ('clustering_benchmarks', '1.1.6', { + 'modulename': 'clustbench', + 'source_urls': ['https://github.com/gagolews/clustering-benchmarks/releases/download/v1.1.6/'], + 'sources': ['clustering_benchmarks-1.1.6.tar.gz'], + }),] sanity_check_paths = { 'files': [], @@ -105,4 +69,3 @@ sanity_check_paths = { moduleclass = 'bio' - diff --git a/envs/fcps.eb b/envs/fcps.eb index ee3db52..dbc9cbf 100644 --- a/envs/fcps.eb +++ b/envs/fcps.eb @@ -8,7 +8,7 @@ version = '1.3.4' versionsuffix = '-r-%(rver)s' homepage = 'https://bioconductor.org' -description = """Omnibenchmark clustering example FCPS deps.""" +description = """Omnibenchmark clustering example FCPS deps, R 4.3.2 BioC 3.18.""" toolchain = {'name': 'foss', 'version': '2023a'} @@ -28,8 +28,8 @@ exts_default_options = { 'https://bioconductor.org/packages/3.18/data/annotation/src/contrib/', 'https://bioconductor.org/packages/3.18/data/experiment/src/contrib/', 'https://cran.r-project.org/src/contrib/Archive/%(name)s', # package archive - 'https://cran.r-project.org/src/contrib/', # current version of packages - 'https://cran.freestatistics.org/src/contrib', # mirror alternative for current packages + 'https://cran.r-project.org/src/contrib/', # current version of packages + 'https://cran.freestatistics.org/src/contrib', # mirror alternative for current packages ], 'sources': ['%(name)s_%(version)s.tar.gz'], } @@ -81,8 +81,8 @@ exts_list = [ ('R6', '2.5.1', { 'checksums': ['8d92bd29c2ed7bf15f2778618ffe4a95556193d21d8431a7f75e7e5fc102bf48'], }), - ('jsonlite', '1.8.9', { - 'checksums': ['89f130e0e1163328c01decd54e7712b5ebf3d0a667da0052833722cb9a6e90b0'], + ('jsonlite', '2.0.0', { + 'checksums': ['75eb910c82b350ec33f094779da0f87bff154c232e4ae39c9896a9b89f3ac82d'], }), ('argparse', '2.2.5', { 'checksums': ['53c8a9eb51041084eb3d9c271b14ebcb32dc2f50cf16afa5c54c504a97229ea4'], @@ -192,13 +192,18 @@ exts_list = [ ('cluster', '2.1.8', { 'checksums': ['c32a462e34694c99d58da953efa74882b5427f8c5db7cb226ae15c54ce6060ca'], }), - ('graph', '1.84.1', { - 'checksums': ['cd2a91c93c81c09d9c59853c417e8a9cdde39b0589bacdce4ca916b6ee5f45a7'], + ('graph', '1.84.0', { + 'source_urls': ['https://mghp.osn.xsede.org/bir190004-bucket01/archive.bioconductor.org/packages/3.20/bioc/src/contrib/Archive/graph/' + 'https://bioconductor.org/packages/3.20/bioc/src/contrib/Archive/graph/'], + 'sources': ['graph_1.84.0.tar.gz'], + 'checksums': ['970bec2341a1c492f5d1cfb41659d9ffaf257a43bff50715fcd0d7cbd6b2073b'], }), ('mclust', '6.1.1', { 'checksums': ['ddd7018e5e6ea7f92c7fc9872b391491b7e91c2cd89ef1dcaf4408afb5116775'], }), - ('cclust', '0.6-26'), + ('cclust', '0.6-26', { + 
'checksums': ['92ec3c55a1864e4e1a4706bfdef8ad00727c720213ac656c718e867286b29857'], + }), ('flowClust', '3.40.0', { 'installopts': "--configure-args='--with-gsl=${EBROOTGSL} --enable-bundled-gsl=false'", 'checksums': ['7e699b06e378e32144704dbec18289109980b0f5eca166180f2c30007b83e0f5'], @@ -231,13 +236,61 @@ exts_list = [ ('FCPS', version, { 'checksums': ['d1e5e06700a81fe529f52ef1f65977d3c786f33df262f4f89238d2622dc7ba97'], }), + ('codetools', '0.2-20', { + 'checksums': ['3be6f375ec178723ddfd559d1e8e85bfeee04a5fbaf9f53f2f844e1669fea863'], + }), + ('iterators', '1.0.14', { + 'checksums': ['cef3075a0930e1408c764e4da56bbadd4f7d14315809df8f38dd51f80ccc677b'], + }), + ('foreach', '1.5.2', { + 'checksums': ['56338d8753f9f68f262cf532fd8a6d0fe25a71a2ff0107f3ce378feb926bafe4'], + }), + ('data.table', '1.17.4', { + 'checksums': ['396eb2a0e38249310070b60cc6c4e930f2cfc940e6ad0ac62c4c7a5b16390753'], + }), + ('ModelMetrics', '1.2.2.2', { + 'checksums': ['5e06f1926aebca5654e1329c66ef19b04058376b2277ebb16e3bf8c208d73457'], + }), + ('generics', '0.1.3', { + 'checksums': ['75046163bfa8b8a4f4214c1b689e796207f6447182f2e5062cf570302387d053'], + }), + ('tidyselect', '1.2.1', { + 'checksums': ['169e97ba0bbfbcdf4a80534322751f87a04370310c40e27f04aac6525d45903c'], + }), + ('dplyr', '1.1.4', { + 'checksums': ['cf730414d5d4ab387b4e9890a4b1df9d17a3903488e8da8df1cf2e11e44558cb'], + }), + ('timechange', '0.3.0', { + 'checksums': ['d85c0b5514ab9578d16032e703c33f197feaed1a424c834ebfcbf0ad46ae46b4'], + }), + ('lubridate', '1.9.3', { + 'checksums': ['2b6e1406d231b0a14d60b99cc406d159fea5465a5694725ad25343f12cf37fff'], + }), + ('tidyr', '1.3.1', { + 'checksums': ['e820c261cb5543f572f49276a7bdc7302aa4215da4bf850b1b939a315353835d'], + }), + ('jsonlite', '2.0.0', { + 'checksums': ['75eb910c82b350ec33f094779da0f87bff154c232e4ae39c9896a9b89f3ac82d'], + }), + ('R.methodsS3', '1.8.2', { + 'checksums': ['822d5e61dad4c91e8883be2b38d7b89f87492046d0fe345704eb5d2658927c2e'], + }), + ('R.oo', '1.25.0', { + 'checksums': ['b8b19061774918ee7d9d4330c16c0ea505f7cd02d01343df1e8b2e4fb847beef'], + }), + ('cgdsr', '1.3.0', { + 'checksums': ['4aa2a3564cee2449c3ff39ab2ad631deb165d4c78b8107e0ff77a9095340cc1f'], + }), + ('R.utils', '2.12.3', { + 'checksums': ['74d6e77a95a23381a490fea54be01b653d4b938a2dc75e749a694ab48302c40c'], + }), ] modextrapaths = {'R_LIBS_SITE': ''} sanity_check_paths = { 'files': [], - 'dirs': ['FCPS', 'dbscan', 'energy', 'protoclust'], + 'dirs': ['FCPS', 'dbscan', 'energy', 'protoclust', 'jsonlite', 'tidyr', 'dplyr'], } moduleclass = 'bio' \ No newline at end of file diff --git a/envs/fcps.yml b/envs/fcps.yml index 877638e..e6ba5ce 100644 --- a/envs/fcps.yml +++ b/envs/fcps.yml @@ -15,3 +15,12 @@ dependencies: - conda-forge::r-cluster==2.1.8 - conda-forge::r-mclust==6.1.1 - r::r-cclust=0.6_26 + - conda-forge::r-rmarkdown=2.29 + - conda-forge::r-cairo=1.6_2 + - conda-forge::r-svglite=2.1.3 + - conda-forge::r-ggplot2=3.5.2 + - conda-forge::r-tidyr=1.3.1 + - bioconda::bioconductor-complexheatmap=2.18.0 + - conda-forge::r-jsonlite=2.0.0 + - conda-forge::r-dplyr=1.1.4 + - conda-forge::r-r.utils=2.13.0 diff --git a/envs/fcps_singularity.def b/envs/fcps_singularity.def index a4a615e..ac69d33 100644 --- a/envs/fcps_singularity.def +++ b/envs/fcps_singularity.def @@ -25,10 +25,13 @@ From: rocker/tidyverse:4.3.3 python3.12 -m venv "default" . 
default/bin/activate - pip install gitpython==3.1.43 isodate pydantic-core + pip install gitpython==3.1.43 isodate "pydantic-core==2.34.1" - ## no versioning here - Rscript -e 'BiocManager::install(c( "dbscan", "cluster", "protoclust", "energy", "argparse", "mclust", "DataVisualizations", "FCPS", "cclust"))' + ## no versioning here - fcps deps + Rscript -e 'BiocManager::install(c("kernlab", "cclust", "dbscan", "kohonen", "MCL", "ADPclust", "cluster", "DatabionicSwarm", "orclus", "subspace", "flexclust", "ABCanalysis", "apcluster", "pracma", "EMCluster", "pdfCluster", "parallelDist", "plotly", "ProjectionBasedClustering", "GeneralizedUmatrix", "mstknnclust", "densityClust", "parallel", "energy", "R.utils", "tclust", "Spectrum", "genie", "protoclust", "fastcluster", "clusterability", "signal", "reshape2", "PPCI", "clustrd", "smacof", "rgl", "prclust", "dendextend", "moments", "prabclus", "VarSelLCM", "sparcl", "mixtools", "HDclassif", "clustvarsel", "knitr", "rmarkdown", "R.utils", "FCPS"))' + + ## no versioning here -- plotting deps + Rscript -e 'BiocManager::install(c("rmarkdown", "svglite", "ggplot2", "tidyr", "ComplexHeatmap", "argparse"))' echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT diff --git a/envs/r.yml b/envs/r.yml deleted file mode 100644 index 456e139..0000000 --- a/envs/r.yml +++ /dev/null @@ -1,12 +0,0 @@ -name: r_for_metrics -channels: - - conda-forge - - nodefaults -dependencies: - - conda-forge::python=3.12.6 - - conda-forge::r-mclust - - conda-forge::r-caret - - conda-forge::r-dplyr - - conda-forge::r-readr - - conda-forge::r-argparse - diff --git a/envs/r_singularity.def b/envs/r_singularity.def deleted file mode 100644 index f1f9ec9..0000000 --- a/envs/r_singularity.def +++ /dev/null @@ -1,37 +0,0 @@ -Bootstrap: docker -From: rocker/tidyverse:4.4 - -%labels - - AUTHOR izaskun.mallona@gmail.com - -%post - - # Install python3.12 - apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git - - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 4 - make altinstall - - # virtualenv - cd /opt - python3.12 -m venv "default" - . default/bin/activate - - pip install gitpython==3.1.43 isodate pydantic-core - - # Install R packages - - Rscript -e 'BiocManager::install(c("mclust", "caret", "readr", "argparse"))' - - echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT - -%environment - - . 
/opt/default/bin/activate diff --git a/envs/rmarkdown.yml b/envs/rmarkdown.yml deleted file mode 100644 index e57969e..0000000 --- a/envs/rmarkdown.yml +++ /dev/null @@ -1,12 +0,0 @@ -name: rmarkdown -channels: - - conda-forge - - bioconda - - nodefaults -dependencies: - - conda-forge::python=3.12.6 - - conda-forge::r-argparse - - conda-forge::r-rmarkdown - - conda-forge::r-ggplot2 - - conda-forge::r-tidyr - - bioconda::bioconductor-complexheatmap diff --git a/envs/sklearn.yml b/envs/sklearn.yml deleted file mode 100644 index 258b7ea..0000000 --- a/envs/sklearn.yml +++ /dev/null @@ -1,11 +0,0 @@ -name: sklearn -channels: - - conda-forge - - nodefaults -dependencies: - - conda-forge::python=3.12.6 - - conda-forge::scikit-learn - - conda-forge::pip - - pip: - - "pandas" - - "argparse" diff --git a/envs/sklearn_singularity.def b/envs/sklearn_singularity.def deleted file mode 100644 index 939a3bb..0000000 --- a/envs/sklearn_singularity.def +++ /dev/null @@ -1,33 +0,0 @@ -Bootstrap: docker -From: ubuntu:jammy-20240911.1 - -%labels - - AUTHOR izaskun.mallona@gmail.com - -%post - - # Install python3.12 - apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git - - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 4 - make altinstall - - # virtualenv - cd /opt - python3.12 -m venv "default" - . default/bin/activate - - pip3 install -U scikit-learn pandas argparse numpy scipy "isodate" "pydantic-core" "gitpython==3.1.43" - - echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT - -%environment - - . /opt/default/bin/activate
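As a closing pointer, the prebuilt-image (ORAS) route described in `envs/README.md` needs no local build step; a hedged sketch of the direct CLI call, with the core count illustrative and apptainer assumed to be installed:

```
# run the benchmark against the prebuilt images referenced by Clustering_oras.yml;
# ob resolves and pulls the oras:// images itself, so nothing is built locally
ob run benchmark -b Clustering_oras.yml --local --cores 10 --yes
```

This mirrors the `run_with_unpinned_oras` Makefile target, minus the timestamped renaming of the `out/` directory.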