Skip to content

Commit 9e9a81a

Browse files
Fix example hpo (#503)
* Use aiaccel-job. * Add job_config. * Fix job_config. * Fix code. * Fix aiaccel-job command. * Fix README. * Remove unnecessary files.
1 parent ef5f090 commit 9e9a81a

File tree

5 files changed

+83
-108
lines changed

5 files changed

+83
-108
lines changed

examples/hpo/optuna/samplers/coco/README.md

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,17 +40,21 @@
4040
- For details, please refer to the following GitHub repository:
4141
https://github.com/numbbo/coco
4242

43-
- Please replace the virtual environment path in `objective.sh` and the `job_group` in `main_parallel_coco.py` with the appropriate paths and IDs.
43+
- Please replace the `# activate environment` placeholder and the `job_group` value in `job_config.yaml` with the appropriate commands and IDs.
4444
- When you run `main_parallel_coco.py`, the validation for each sampler will be executed.
45-
- The results are saved in `optuna_csv` and `step_csv` under each directory.
4645

47-
cd nelder-mead
48-
python main_parallel.py
46+
```
47+
aiaccel-job pbs --config job_config.yaml gpu --walltime 4:00:00 main_parallel_coco.log -- python3.13 main_parallel_coco.py
48+
```
49+
50+
- The results are saved in `optuna_csv` and `step_csv` under each directory.
4951

5052
- To run `plot.py`, you need to install pandas and matplotlib.
5153

54+
```
5255
pip install pandas matplotlib
5356
python plot.py
57+
```
5458

5559
## 3. Checking the Results
5660

examples/hpo/optuna/samplers/coco/README_ja.md

Lines changed: 0 additions & 63 deletions
This file was deleted.
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
walltime: "1:0:0"
2+
3+
script_prologue: |
4+
echo Job ID: $PBS_JOBID
5+
echo Hostname: $(hostname)
6+
7+
export NVIDIA_VISIBLE_DEVICES=all
8+
export JOB_GROUP=job_group
9+
10+
# activate environment
11+
12+
qsub: "qsub -P $JOB_GROUP -l walltime={args.walltime} -v USE_SSH=1"
13+
14+
cpu:
15+
qsub_args: "-q rt_HF -l select=1"
16+
job: "{command}"
17+
18+
cpu-array:
19+
n_tasks_per_proc: 128
20+
n_procs: 24
21+
qsub_args: "-q rt_HF -l select=1 -J 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))"
22+
job: "{command}"
23+
24+
gpu:
25+
qsub_args: "-q rt_HF -l select=1"
26+
job: "{command}"
27+
28+
gpu-array:
29+
n_tasks_per_proc: 1
30+
n_procs: 1
31+
qsub_args: "-q rt_HF -l select=1 -J 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))"
32+
job: "CUDA_VISIBLE_DEVICES=$(( LOCAL_PROC_INDEX % 8 )) {command}"
33+
34+
mpi:
35+
n_nodes: 1
36+
qsub_args: >-
37+
-q rt_HF
38+
-l select={args.n_nodes}:mpiprocs=$(( {args.n_procs} / {args.n_nodes} )):ompthreads=$(( {args.n_nodes} * 96 / {args.n_procs} ))
39+
job: |
40+
source /etc/profile.d/modules.sh
41+
module load hpcx
42+
43+
mpirun -np {args.n_procs} -bind-to none -map-by slot \\
44+
-mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \\
45+
{command}
46+
47+
train:
48+
qsub_args: >-
49+
-q $( (({args.n_gpus}==1)) && printf rt_HG || printf rt_HF )
50+
-l select=$(( ({args.n_gpus} + 7) / 8 )):mpiprocs=$( (({args.n_gpus}==1)) && printf 1 || printf 8 ):ompthreads=$( (({args.n_gpus}==1)) && printf 8 || printf 12 )
51+
job: |
52+
source /etc/profile.d/modules.sh
53+
module load hpcx
54+
55+
mpirun -np {args.n_gpus} -bind-to none -map-by slot \\
56+
-mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \\
57+
-x MAIN_ADDR=$(hostname -i) \\
58+
-x MAIN_PORT=3000 \\
59+
-x COLUMNS=120 \\
60+
-x PYTHONUNBUFFERED=true \\
61+
{command}

examples/hpo/optuna/samplers/coco/main_parallel_coco.py

Lines changed: 14 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,9 @@
1+
from concurrent.futures import ThreadPoolExecutor
12
from itertools import product
2-
from pathlib import Path
3-
4-
from aiaccel.job.jobs.abci_job import AbciJob
3+
import subprocess
54

65

76
def main() -> None:
8-
job_filename = Path("objective.sh")
9-
job_group = "xxx"
10-
117
sampler_names = ["nelder-mead", "nelder-mead-subTPE", "TPE"]
128
func_ids = list(range(1, 25))
139
dims = [2, 3, 5, 10, 20, 40]
@@ -19,30 +15,18 @@ def main() -> None:
1915
sampler_names, func_ids, zip(dims, execute_times, strict=False), zip(instances, optuna_seeds, strict=False)
2016
)
2117

22-
for sampler_name, func_id, (dim, execute_time), (instance, optuna_seed) in combinations:
23-
execute_time = "0:05:00" if sampler_name == "nelder-mead" else execute_time
24-
print(sampler_name, (func_id, execute_time), dim, (instance, optuna_seed))
25-
job = AbciJob(
26-
job_filename,
27-
job_group,
28-
qsub_args=[
29-
"-l",
30-
f"h_rt={execute_time}",
31-
],
32-
args=[
33-
"--func_id",
34-
f"{func_id}",
35-
"--dim",
36-
f"{dim}",
37-
"--instance",
38-
f"{instance}",
39-
"--optuna_seed",
40-
f"{optuna_seed}",
41-
"--sampler_name",
42-
f"{sampler_name}",
43-
],
44-
)
45-
job.submit()
18+
with ThreadPoolExecutor() as pool:
19+
for sampler_name, func_id, (dim, execute_time), (instance, optuna_seed) in combinations:
20+
execute_time = "0:05:00" if sampler_name == "nelder-mead" else execute_time
21+
print(sampler_name, (func_id, execute_time), dim, (instance, optuna_seed))
22+
23+
aiaccel_job_command = f"""\
24+
aiaccel-job pbs --config job_config.yaml cpu --walltime {execute_time} log/job_{func_id}_{dim}_{instance}.log \
25+
-- python3.13 experiment_coco.py --func_id {func_id} --dim {dim} \
26+
--instance {instance} --optuna_seed {optuna_seed} --sampler_name {sampler_name}
27+
"""
28+
29+
pool.submit(subprocess.run, aiaccel_job_command, shell=True)
4630

4731

4832
if __name__ == "__main__":

examples/hpo/optuna/samplers/coco/objective.sh

Lines changed: 0 additions & 11 deletions
This file was deleted.

0 commit comments

Comments
 (0)