Skip to content

Commit 0ce2acc

Browse files
Merge pull request #59 from oracle-quickstart/25.5.0
Add v25.5.0
2 parents 1861ee8 + af7881e commit 0ce2acc

23 files changed

+9659
-16
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ You can use the instructions [here](https://docs.oracle.com/en-us/iaas/Content/C
4646
### Deploy the cluster using the Oracle Cloud Resource Manager template
4747
You can easily deploy the cluster using the **Deploy to Oracle Cloud** button below.
4848

49-
[![Deploy to Oracle Cloud](https://oci-resourcemanager-plugin.plugins.oci.oraclecloud.com/latest/deploy-to-oracle-cloud.svg)](https://cloud.oracle.com/resourcemanager/stacks/create?zipUrl=https://github.com/oracle-quickstart/oci-hpc-oke/releases/download/v25.4.0/oke-rdma-quickstart-v25.4.0.zip)
49+
[![Deploy to Oracle Cloud](https://oci-resourcemanager-plugin.plugins.oci.oraclecloud.com/latest/deploy-to-oracle-cloud.svg)](https://cloud.oracle.com/resourcemanager/stacks/create?zipUrl=https://github.com/oracle-quickstart/oci-hpc-oke/releases/download/v25.5.0/oke-rdma-quickstart-v25.5.0.zip)
5050

5151
For the image ID, use the ID of the image that you imported in the previous step.
5252

@@ -262,6 +262,7 @@ If you have a question that is not listed below, you can create an issue in the
262262
- [I have large container images. Can I import them from a shared location instead of downloading them?](#i-have-large-container-images-can-i-import-them-from-a-shared-location-instead-of-downloading-them)
263263
- [How can I run GPU & RDMA health checks in my nodes?](#how-can-i-run-gpu--rdma-health-checks-in-my-nodes)
264264
- [Can I autoscale my RDMA enabled nodes in a Cluster Network?](#can-i-autoscale-my-rdma-enabled-nodes-in-a-cluster-network)
265+
- [How do I use network locality information when running workloads on OKE?](#how-do-i-use-network-locality-information-when-running-workloads-on-oke)
265266

266267
### Are there any features that are not supported when using self-managed nodes?
267268
Some features and capabilities are not available, or not yet available, when using self-managed nodes. Please see [this link](https://docs.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengworkingwithselfmanagednodes.htm) for a list of features and capabilities that are not available for self-managed nodes.
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Copyright (c) 2024 Oracle Corporation and/or its affiliates.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl
3+
4+
resource "helm_release" "amd_device_metrics_exporter" {
5+
count = var.install_amd_device_metrics_exporter && var.install_node_problem_detector_kube_prometheus_stack ? 1 : 0
6+
depends_on = [helm_release.prometheus]
7+
namespace = var.monitoring_namespace
8+
name = "amd-device-metrics-exporter"
9+
chart = "device-metrics-exporter-charts"
10+
repository = "https://rocm.github.io/device-metrics-exporter"
11+
version = var.amd_device_metrics_exporter_chart_version
12+
values = ["${file("./files/amd-device-metrics-exporter/values.yaml")}"]
13+
create_namespace = false
14+
recreate_pods = true
15+
force_update = true
16+
dependency_update = true
17+
wait = false
18+
max_history = 1
19+
}
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
platform: k8s
2+
3+
# -- Add node selector for the daemonset of metrics exporter
4+
nodeSelector:
5+
node.kubernetes.io/instance-type: BM.GPU.MI300X.8
6+
7+
# -- Add tolerations for deploying metrics exporter on tainted nodes
8+
tolerations:
9+
- effect: NoSchedule
10+
operator: Exists
11+
12+
image:
13+
# -- repository URL for the metrics exporter image
14+
repository: docker.io/rocm/device-metrics-exporter
15+
# -- metrics exporter image tag
16+
tag: v1.2.1
17+
# -- metrics exporter image pullPolicy
18+
pullPolicy: Always
19+
# -- metrics exporter image pullSecret name
20+
pullSecrets: ""
21+
# -- metrics exporter initContainer image
22+
initContainerImage: busybox:1.36
23+
24+
service:
25+
# -- metrics exporter service type, could be ClusterIP or NodePort
26+
type: ClusterIP
27+
ClusterIP:
28+
# -- set port for ClusterIP type service
29+
port: 5000
30+
NodePort:
31+
# -- set port for NodePort type service
32+
port: 5000
33+
# -- set nodePort for NodePort type service
34+
nodePort: 32500
35+
36+
# -- Name of a configMap holding custom configuration to mount into the metrics exporter container
37+
configMap: ""
38+
39+
# -- ServiceMonitor configuration
40+
serviceMonitor:
41+
# -- Whether to create a ServiceMonitor resource for Prometheus Operator
42+
enabled: true
43+
# -- Scrape interval for the ServiceMonitor
44+
interval: "30s"
45+
# -- Honor labels configuration for ServiceMonitor
46+
honorLabels: true
47+
# -- Honor timestamps configuration for ServiceMonitor
48+
honorTimestamps: true
49+
# -- Attach node metadata to discovered targets (required by the __meta_kubernetes_node_* relabelings below)
50+
attachMetadata:
51+
node: true
52+
labels:
53+
release: kube-prometheus-stack
54+
# -- RelabelConfigs to apply to targets before scraping
55+
relabelings:
56+
- sourceLabels: [__meta_kubernetes_pod_node_name]
57+
separator: ;
58+
regex: ^(.*)$
59+
targetLabel: hostname
60+
replacement: $1
61+
action: replace
62+
- sourceLabels: [__meta_kubernetes_node_provider_id]
63+
targetLabel: instance_id
64+
action: replace
65+
- sourceLabels: [__meta_kubernetes_node_label_oci_oraclecloud_com_host_serial_number]
66+
targetLabel: host_serial_number
67+
action: replace
68+
- sourceLabels: [__meta_kubernetes_node_label_node_kubernetes_io_instance_type]
69+
targetLabel: instance_shape
70+
action: replace
71+
- sourceLabels: [__meta_kubernetes_node_label_oci_oraclecloud_com_rdma_cluster_id]
72+
targetLabel: cluster_name
73+
action: replace

0 commit comments

Comments
 (0)