oracle-quickstart
diff --git a/‎README.md
Lines changed: 2 additions & 1 deletion b/‎README.md
Lines changed: 2 additions & 1 deletion
diff --git a/‎terraform/amd-device-metrics-exporter.tf
Lines changed: 19 additions & 0 deletions b/‎terraform/amd-device-metrics-exporter.tf
Lines changed: 19 additions & 0 deletions
diff --git a/‎terraform/files/amd-device-metrics-exporter/values.yaml
Lines changed: 73 additions & 0 deletions b/‎terraform/files/amd-device-metrics-exporter/values.yaml
Lines changed: 73 additions & 0 deletions
@@ -46,7 +46,7 @@ You can use the instructions [here](https://docs.oracle.com/en-us/iaas/Content/C
 ### Deploy the cluster using the Oracle Cloud Resource Manager template
 You can easily deploy the cluster using the **Deploy to Oracle Cloud** button below.
 
-[![Deploy to Oracle Cloud](https://oci-resourcemanager-plugin.plugins.oci.oraclecloud.com/latest/deploy-to-oracle-cloud.svg)](https://cloud.oracle.com/resourcemanager/stacks/create?zipUrl=https://github.com/oracle-quickstart/oci-hpc-oke/releases/download/v25.4.0/oke-rdma-quickstart-v25.4.0.zip)
+[![Deploy to Oracle Cloud](https://oci-resourcemanager-plugin.plugins.oci.oraclecloud.com/latest/deploy-to-oracle-cloud.svg)](https://cloud.oracle.com/resourcemanager/stacks/create?zipUrl=https://github.com/oracle-quickstart/oci-hpc-oke/releases/download/v25.5.0/oke-rdma-quickstart-v25.5.0.zip)
 
 For the image ID, use the ID of the image that you imported in the previous step.
 
@@ -262,6 +262,7 @@ If you have a question that is not listed below, you can create an issue in the
 - [I have large container images. Can I import them from a shared location instead of downloading them?](#i-have-large-container-images-can-i-import-them-from-a-shared-location-instead-of-downloading-them)
 - [How can I run GPU & RDMA health checks in my nodes?](#how-can-i-run-gpu--rdma-health-checks-in-my-nodes)
 - [Can I autoscale my RDMA enabled nodes in a Cluster Network?](#can-i-autoscale-my-rdma-enabled-nodes-in-a-cluster-network)
+- [How do I use network locality information when running workloads on OKE?](#how-do-i-use-network-locality-information-when-running-workloads-on-oke)
 
 ### Are there any features that are not supported when using self-managed nodes?
 Some features and capabilities are not available, or not yet available, when using self-managed nodes. Please see [this link](https://docs.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengworkingwithselfmanagednodes.htm) for a list of features and capabilities that are not available for self-managed nodes.
 
@@ -0,0 +1,19 @@
+# Copyright (c) 2024 Oracle Corporation and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl
+
+resource "helm_release" "amd_device_metrics_exporter" { # AMD device metrics exporter chart; created only when both install flags below are true
+  count             = var.install_amd_device_metrics_exporter && var.install_node_problem_detector_kube_prometheus_stack ? 1 : 0
+  depends_on        = [helm_release.prometheus] # NOTE(review): presumably so the ServiceMonitor CRD exists first - confirm
+  namespace         = var.monitoring_namespace
+  name              = "amd-device-metrics-exporter"
+  chart             = "device-metrics-exporter-charts"
+  repository        = "https://rocm.github.io/device-metrics-exporter"
+  version           = var.amd_device_metrics_exporter_chart_version
+  values            = [file("./files/amd-device-metrics-exporter/values.yaml")] # direct call; "${...}"-only wrapping is deprecated
+  create_namespace  = false # Helm will not create var.monitoring_namespace
+  recreate_pods     = true
+  force_update      = true
+  dependency_update = true
+  wait              = false # apply does not block on the release's resources becoming ready
+  max_history       = 1
+}
@@ -0,0 +1,73 @@
+platform: k8s
+
+# -- Add node selector for the daemonset of metrics exporter
+nodeSelector:
+  node.kubernetes.io/instance-type: BM.GPU.MI300X.8  # only nodes carrying this instance-type label are selected
+
+# -- Add tolerations for deploying metrics exporter on tainted nodes
+tolerations:
+  - effect: NoSchedule
+    operator: Exists  # no key given, so every NoSchedule taint is tolerated
+
+image:
+  # -- repository URL for the metrics exporter image
+  repository: docker.io/rocm/device-metrics-exporter
+  # -- metrics exporter image tag
+  tag: v1.2.1
+  # -- metrics exporter image pullPolicy
+  pullPolicy: Always  # registry is checked on every container start, even though the tag is pinned
+  # -- metrics exporter image pullSecret name
+  pullSecrets: ""  # NOTE(review): empty presumably means no pull secret - confirm against the chart
+  # -- metrics exporter initContainer image
+  initContainerImage: busybox:1.36
+
+service:
+  # -- metrics exporter service type, could be ClusterIP or NodePort
+  type: ClusterIP  # NOTE(review): the NodePort settings below are presumably unused with this type - confirm
+  ClusterIP:
+    # -- set port for ClusterIP type service
+    port: 5000
+  NodePort:
+    # -- set port for NodePort type service
+    port: 5000
+    # -- set nodePort for NodePort type service
+    nodePort: 32500
+
+# -- configMap name for the customizing configs and mount into metrics exporter container
+configMap: ""  # NOTE(review): empty presumably keeps the chart's default exporter config - confirm
+
+# -- ServiceMonitor configuration
+serviceMonitor:
+  # -- Whether to create a ServiceMonitor resource for Prometheus Operator
+  enabled: true
+  # -- Scrape interval for the ServiceMonitor
+  interval: "30s"
+  # -- Honor labels configuration for ServiceMonitor
+  honorLabels: true
+  # -- Honor timestamps configuration for ServiceMonitor
+  honorTimestamps: true
+  # -- Attach node metadata to discovered targets (source of the __meta_kubernetes_node_* labels used in relabelings)
+  attachMetadata:
+    node: true
+  labels:  # -- Additional labels for the ServiceMonitor
+    release: kube-prometheus-stack  # NOTE(review): presumably must match the Prometheus serviceMonitorSelector - confirm
+  # -- Relabeling rules applied to each target's labels before it is scraped
+  relabelings:
+    - sourceLabels: [__meta_kubernetes_pod_node_name]  # node the exporter pod runs on
+      separator: ;
+      regex: ^(.*)$
+      targetLabel: hostname
+      replacement: $1  # whole source value is copied unchanged
+      action: replace
+    - sourceLabels: [__meta_kubernetes_node_provider_id]  # NOTE(review): confirm this is populated; attached node metadata may only carry name/labels/annotations
+      targetLabel: instance_id
+      action: replace
+    - sourceLabels: [__meta_kubernetes_node_label_oci_oraclecloud_com_host_serial_number]  # node label oci.oraclecloud.com/host.serial_number
+      targetLabel: host_serial_number
+      action: replace
+    - sourceLabels: [__meta_kubernetes_node_label_node_kubernetes_io_instance_type]  # node label node.kubernetes.io/instance-type
+      targetLabel: instance_shape
+      action: replace
+    - sourceLabels: [__meta_kubernetes_node_label_oci_oraclecloud_com_rdma_cluster_id]  # node label oci.oraclecloud.com/rdma.cluster_id
+      targetLabel: cluster_name
+      action: replace