Add readme for deploying collector to ECS EC2

bjsignalfx · bjsignalfx · commit cff5645283bd · 2021-08-27T14:27:20.000-04:00
diff --git a/cmd/otelcol/config/collector/ecs_ec2_config.yaml b/cmd/otelcol/config/collector/ecs_ec2_config.yaml
@@ -0,0 +1,161 @@
+# This collector config file is designed for use within an ECS task.
+# The collector should run as a Daemon service in an ECS cluster that uses the EC2 launch type.
+config_sources:
+  env:
+    defaults:
+      METRICS_TO_EXCLUDE: []
+      ECS_METADATA_EXCLUDED_IMAGES: []
+
+extensions:
+  health_check:
+    endpoint: 0.0.0.0:13133
+  http_forwarder:
+    ingress:
+      endpoint: 0.0.0.0:6060
+    egress:
+      endpoint: "https://api.${SPLUNK_REALM}.signalfx.com"
+  zpages:
+    endpoint: 0.0.0.0:55679
+
+receivers:
+  hostmetrics:
+    collection_interval: 10s
+    scrapers:
+      cpu:
+      disk:
+      filesystem:
+      memory:
+      network:
+      # System load average metrics https://en.wikipedia.org/wiki/Load_(computing)
+      load:
+      # Paging/Swap space utilization and I/O metrics
+      paging:
+      # Aggregated system process count metrics
+      processes:
+      # System processes metrics, disabled by default
+      # process:
+  jaeger:
+    protocols:
+      grpc:
+        endpoint: 0.0.0.0:14250
+      thrift_http:
+        endpoint: 0.0.0.0:14268
+  otlp:
+    protocols:
+      grpc:
+        endpoint: 0.0.0.0:4317
+      http:
+        endpoint: 0.0.0.0:55681
+  # This section is used to collect the OpenTelemetry Collector metrics
+  # Even if just a Splunk APM customer, these metrics are included
+  prometheus/internal:
+    config:
+      scrape_configs:
+        - job_name: 'otel-collector'
+          scrape_interval: 10s
+          static_configs:
+            - targets: ['0.0.0.0:8888']
+          metric_relabel_configs:
+            - source_labels: [__name__]
+              regex: '.*grpc_io.*'
+              action: drop
+  signalfx:
+    endpoint: 0.0.0.0:9943
+  zipkin:
+    endpoint: 0.0.0.0:9411
+  smartagent/signalfx-forwarder:
+    type: signalfx-forwarder
+    listenAddress: 0.0.0.0:9080
+  smartagent/ecs-metadata:
+    type: ecs-metadata
+    metadataEndpoint: "${ECS_TASK_METADATA_ENDPOINT}"
+    statsEndpoint: "${ECS_TASK_STATS_ENDPOINT}"
+    excludedImages: ${env:ECS_METADATA_EXCLUDED_IMAGES}
+
+processors:
+  batch:
+  # Enabling the memory_limiter is strongly recommended for every pipeline.
+  # Configuration is based on the amount of memory allocated to the collector.
+  # In general, the ballast should be set to 1/3 of the collector's memory, the limit
+  # should be 90% of the collector's memory. The simplest way to specify the
+  # ballast size is to set the value of the SPLUNK_BALLAST_SIZE_MIB env variable. Alternatively, the
+  # --mem-ballast-size-mib command line flag can be passed, and it takes priority.
+  # For more information about memory limiter, see
+  # https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/memorylimiter/README.md
+  memory_limiter:
+    check_interval: 2s
+    limit_mib: ${SPLUNK_MEMORY_LIMIT_MIB}
+  # detect if the collector is running on a cloud system
+  # important for creating unique cloud provider dimensions
+  resourcedetection:
+    detectors: [ecs]
+    override: false
+  # Same as above but overrides resource attributes set by receivers
+  resourcedetection/internal:
+    detectors: [ecs]
+    override: true
+  # Enables the filter processor with example settings
+  # Full configuration here: https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/filterprocessor
+  # NOTE: These settings need to be changed when using this processor
+  filter/1:
+    metrics:
+      exclude:
+        match_type: regexp
+        metric_names: ${env:METRICS_TO_EXCLUDE}
+#  # Optional: The following processor can be used to add a default "deployment.environment" attribute to the logs and
+#  # traces when it's not populated by instrumentation libraries.
+#  # If enabled, make sure to enable this processor in the pipeline below.
+#  resource/add_environment:
+#    attributes:
+#      - action: insert
+#        value: staging/production/...
+#        key: deployment.environment
+
+exporters:
+  # Traces
+  sapm:
+    access_token: "${SPLUNK_ACCESS_TOKEN}"
+    endpoint: "https://ingest.${SPLUNK_REALM}.signalfx.com/v2/trace"
+  # Metrics + Events
+  signalfx:
+    access_token: "${SPLUNK_ACCESS_TOKEN}"
+    realm: "${SPLUNK_REALM}"
+    correlation:
+#  # Logs
+#  splunk_hec:
+#    token: "${SPLUNK_HEC_TOKEN}"
+#    endpoint: "${SPLUNK_HEC_URL}"
+#    source: "otel"
+#    sourcetype: "otel"
+
+service:
+  extensions: [health_check, http_forwarder, zpages]
+  pipelines:
+    traces:
+      receivers: [jaeger, otlp, zipkin, smartagent/signalfx-forwarder]
+      processors:
+        - memory_limiter
+        - batch
+        - resourcedetection
+      #        - resource/add_environment
+      exporters: [sapm, signalfx]
+    metrics:
+      receivers: [hostmetrics, signalfx, smartagent/signalfx-forwarder, smartagent/ecs-metadata]
+      processors: [memory_limiter, batch, resourcedetection]
+      exporters: [signalfx]
+    metrics/internal:
+      receivers: [prometheus/internal]
+      processors: [memory_limiter, batch, resourcedetection/internal]
+      exporters: [signalfx]
+#    logs:
+#      receivers: [otlp]
+#      processors:
+#        - memory_limiter
+#        - batch
+#        - resourcedetection
+#      #- resource/add_environment
+#      exporters: [splunk_hec]
+#      # Use instead when sending to gateway
+#      #exporters: [otlp]
+
+
diff --git a/deployments/ecs/ec2/README.md b/deployments/ecs/ec2/README.md
@@ -0,0 +1,146 @@
+# Amazon ECS EC2 Deployment
+Familiarity with Amazon ECS using launch type EC2 is assumed. Consult the 
+[Getting started with the Amazon ECS console using Amazon EC2](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/getting-started-ecs-ec2.html)
+for further reading.
+
+The
+[Splunk OpenTelemetry Connector](https://github.com/signalfx/splunk-otel-collector)
+(Collector) should be run as a Daemon service in an EC2 ECS cluster.
+
+Requires Connector release v0.34.0 or newer, which corresponds to image tag 0.34.0 and newer.
+See image repository [here](https://quay.io/repository/signalfx/splunk-otel-collector?tab=tags).
+
+## Getting Started
+### Create Task Definition
+Take the task definition JSON for the Collector [here](./splunk-otel-collector.json), replace
+`MY_SPLUNK_ACCESS_TOKEN` and `MY_SPLUNK_REALM` with valid values. Update the image tag to
+the newest version. Use the JSON to create a task definition of **EC2 launch type** following
+the instructions [here](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/create-task-definition.html).
+
+The Collector is configured to use the default configuration file `/etc/otel/collector/ecs_ec2_config.yaml`.
+The Collector image Dockerfile is available [here](../../../cmd/otelcol/Dockerfile) and the contents of the default
+configuration file can be seen [here](../../../cmd/otelcol/config/collector/ecs_ec2_config.yaml).
+
+**Note**: You do not need the `smartagent/ecs-metadata` metrics receiver in the default
+configuration file if all you want is tracing. You can take the default configuration, remove
+the receiver, then use the result as a custom configuration following the directions
+in the [custom configuration](#custom-configuration) section.
+
+The configured network mode for the task is **host**. This means that **task metadata endpoint
+version 2**, used by receiver `smartagent/ecs-metadata`, is not enabled by default. See
+[here](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-metadata-endpoint.html)
+to check whether **task metadata endpoint version 3** is enabled by default for your task. If it is
+enabled, add the following to the **environment** list in the task definition JSON:
+```json
+{
+  "name": "ECS_TASK_METADATA_ENDPOINT",
+  "value": "${ECS_CONTAINER_METADATA_URI}/task"
+},
+{
+  "name": "ECS_TASK_STATS_ENDPOINT",
+  "value": "${ECS_CONTAINER_METADATA_URI}/task/stats"
+}
+```
+
+Assign a stringified array of metrics you want excluded to environment variable
+`METRICS_TO_EXCLUDE`. You can set the memory limit for the memory limiter processor using
+environment variable `SPLUNK_MEMORY_LIMIT_MIB`. The default memory limit is 512 MiB. For
+more information about the memory limiter processor, see
+[here](https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/memorylimiter/README.md).
+
+### Launch the Collector
+The Collector is designed to be run as a Daemon service in an EC2 ECS cluster.
+
+To create a Collector service from the Amazon ECS console:
+
+Go to your cluster in the console
+1. Click on the "Services" tab.
+2. Click "Create" at the top of the tab.
+3. Select:
+   - Launch Type -> EC2
+   - Task Definition (Family) -> splunk-otel-collector
+   - Task Definition (Revision) -> 1 (or whatever the latest is in your case)
+   - Service Name -> splunk-otel-collector
+   - Service type -> DAEMON
+4. Leave everything else at default and click "Next step"
+5. Leave everything on the next page at its default and click "Next step".
+6. Leave everything on the following page at its default and click "Next step".
+7. Click "Create Service" and the collector should be deployed onto each node in the ECS cluster. You should see infrastructure and docker metrics flowing soon.
+
+## Custom Configuration
+To use a custom configuration file, replace the value of environment variable
+`SPLUNK_CONFIG` with the file path of the custom configuration file in the Collector
+task definition.
+
+Alternatively, you can specify the custom configuration YAML directly using environment
+variable `SPLUNK_CONFIG_YAML` as described [below](#direct-configuration).
+
+### ecs_observer
+Use extension
+[Amazon Elastic Container Service Observer](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/extension/observer/ecsobserver#amazon-elastic-container-service-observer)
+(`ecs_observer`) in your custom configuration to discover metrics targets
+in running tasks, filtered by service names, task definitions and container labels.
+`ecs_observer` is currently limited to Prometheus targets and requires the read-only
+permissions below. You can add the permissions to the task role by adding them to a 
+customer-managed policy that is attached to the task role.
+```text
+ecs:List*
+ecs:Describe*
+```
+
+Below is an example of a custom configuration in which the `ecs_observer` is configured to find
+Prometheus targets in cluster `lorem-ipsum-cluster`, region `us-west-2`, where the task ARN
+pattern is `^arn:aws:ecs:us-west-2:906383545488:task-definition/lorem-ipsum-task:[0-9]+$`.
+The results are written to file `/etc/ecs_sd_targets.yaml`. The `prometheus` receiver is
+configured to read targets from the results file. The values for `access_token`
+and `realm` are read from environment variables `SPLUNK_ACCESS_TOKEN` and `SPLUNK_REALM`
+respectively, which must be specified in your container definition.
+
+```yaml
+extensions:
+  ecs_observer:
+    refresh_interval: 10s
+    cluster_name: 'lorem-ipsum-cluster'
+    cluster_region: 'us-west-2'
+    result_file: '/etc/ecs_sd_targets.yaml'
+    task_definitions:
+      - arn_pattern: "^arn:aws:ecs:us-west-2:906383545488:task-definition/lorem-ipsum-task:[0-9]+$"
+        metrics_ports: [9113]
+        metrics_path: /metrics
+receivers:
+  prometheus:
+    config:
+      scrape_configs:
+        - job_name: 'lorem-ipsum-nginx'
+          scrape_interval: 10s
+          file_sd_configs:
+            - files:
+                - '/etc/ecs_sd_targets.yaml'
+processors:
+  batch:
+  resourcedetection:
+    detectors: [ecs]
+    override: false    
+exporters:
+  signalfx:
+    access_token: ${SPLUNK_ACCESS_TOKEN}
+    realm: ${SPLUNK_REALM}
+service:
+  extensions: [ecs_observer]
+  pipelines:
+    metrics:
+      receivers: [prometheus]
+      processors: [batch, resourcedetection]
+      exporters: [signalfx]
+```
+
+### Direct Configuration
+The Collector provides environment variable `SPLUNK_CONFIG_YAML` for specifying the
+configuration YAML directly, which can be used instead of `SPLUNK_CONFIG`.
+
+For example, you can store the custom configuration above in a parameter called
+`splunk-otel-collector-config` in **AWS Systems Manager Parameter Store**. Then
+assign the parameter to environment variable `SPLUNK_CONFIG_YAML` using `valueFrom`.
+
+**Note:** You should add policy `AmazonSSMReadOnlyAccess` to the task role in order for
+the task to have read access to the Parameter Store.
diff --git a/deployments/ecs/ec2/splunk-otel-collector.json b/deployments/ecs/ec2/splunk-otel-collector.json