Skip to content

Commit cff5645

Browse files
committed
Add readme for deploying collector to ECS EC2
1 parent b1cd0a2 commit cff5645

File tree

3 files changed

+454
-0
lines changed

3 files changed

+454
-0
lines changed
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
# This collector config file is designed for use within an ECS task.
2+
# The collector should run in a sidecar container within an ECS task.
3+
config_sources:
4+
env:
5+
defaults:
6+
METRICS_TO_EXCLUDE: []
7+
ECS_METADATA_EXCLUDED_IMAGES: []
8+
9+
extensions:
10+
health_check:
11+
endpoint: 0.0.0.0:13133
12+
http_forwarder:
13+
ingress:
14+
endpoint: 0.0.0.0:6060
15+
egress:
16+
endpoint: "https://api.${SPLUNK_REALM}.signalfx.com"
17+
zpages:
18+
endpoint: 0.0.0.0:55679
19+
20+
receivers:
21+
hostmetrics:
22+
collection_interval: 10s
23+
scrapers:
24+
cpu:
25+
disk:
26+
filesystem:
27+
memory:
28+
network:
29+
# System load average metrics https://en.wikipedia.org/wiki/Load_(computing)
30+
load:
31+
# Paging/Swap space utilization and I/O metrics
32+
paging:
33+
# Aggregated system process count metrics
34+
processes:
35+
# System processes metrics, disabled by default
36+
# process:
37+
jaeger:
38+
protocols:
39+
grpc:
40+
endpoint: 0.0.0.0:14250
41+
thrift_http:
42+
endpoint: 0.0.0.0:14268
43+
otlp:
44+
protocols:
45+
grpc:
46+
endpoint: 0.0.0.0:4317
47+
http:
48+
endpoint: 0.0.0.0:55681
49+
# This section is used to collect the OpenTelemetry Collector metrics
50+
# Even if just a Splunk APM customer, these metrics are included
51+
prometheus/internal:
52+
config:
53+
scrape_configs:
54+
- job_name: 'otel-collector'
55+
scrape_interval: 10s
56+
static_configs:
57+
- targets: ['0.0.0.0:8888']
58+
metric_relabel_configs:
59+
- source_labels: [__name__]
60+
regex: '.*grpc_io.*'
61+
action: drop
62+
signalfx:
63+
endpoint: 0.0.0.0:9943
64+
zipkin:
65+
endpoint: 0.0.0.0:9411
66+
smartagent/signalfx-forwarder:
67+
type: signalfx-forwarder
68+
listenAddress: 0.0.0.0:9080
69+
smartagent/ecs-metadata:
70+
type: ecs-metadata
71+
metadataEndpoint: "${ECS_TASK_METADATA_ENDPOINT}"
72+
statsEndpoint: "${ECS_TASK_STATS_ENDPOINT}"
73+
excludedImages: ${env:ECS_METADATA_EXCLUDED_IMAGES}
74+
75+
processors:
76+
batch:
77+
# Enabling the memory_limiter is strongly recommended for every pipeline.
78+
# Configuration is based on the amount of memory allocated to the collector.
79+
# In general, the ballast should be set to 1/3 of the collector's memory, the limit
80+
# should be 90% of the collector's memory. The simplest way to specify the
81+
# ballast size is to set the value of the SPLUNK_BALLAST_SIZE_MIB env variable. Alternatively, the
82+
# --mem-ballast-size-mib command line flag can be passed and takes priority.
83+
# For more information about memory limiter, see
84+
# https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/memorylimiter/README.md
85+
memory_limiter:
86+
check_interval: 2s
87+
limit_mib: ${SPLUNK_MEMORY_LIMIT_MIB}
88+
# detect if the collector is running on a cloud system
89+
# important for creating unique cloud provider dimensions
90+
resourcedetection:
91+
detectors: [ecs]
92+
override: false
93+
# Same as above but overrides resource attributes set by receivers
94+
resourcedetection/internal:
95+
detectors: [ecs]
96+
override: true
97+
# Enables the filter processor with example settings
98+
# Full configuration here: https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/filterprocessor
99+
# NOTE: These settings need to be changed when using this processor
100+
filter/1:
101+
metrics:
102+
exclude:
103+
match_type: regexp
104+
metric_names: ${env:METRICS_TO_EXCLUDE}
105+
# # Optional: The following processor can be used to add a default "deployment.environment" attribute to the logs and
106+
# # traces when it's not populated by instrumentation libraries.
107+
# # If enabled, make sure to enable this processor in the pipeline below.
108+
# resource/add_environment:
109+
# attributes:
110+
# - action: insert
111+
# value: staging/production/...
112+
# key: deployment.environment
113+
114+
exporters:
115+
# Traces
116+
sapm:
117+
access_token: "${SPLUNK_ACCESS_TOKEN}"
118+
endpoint: "https://ingest.${SPLUNK_REALM}.signalfx.com/v2/trace"
119+
# Metrics + Events
120+
signalfx:
121+
access_token: "${SPLUNK_ACCESS_TOKEN}"
122+
realm: "${SPLUNK_REALM}"
123+
correlation:
124+
# # Logs
125+
# splunk_hec:
126+
# token: "${SPLUNK_HEC_TOKEN}"
127+
# endpoint: "${SPLUNK_HEC_URL}"
128+
# source: "otel"
129+
# sourcetype: "otel"
130+
131+
service:
132+
extensions: [health_check, http_forwarder, zpages]
133+
pipelines:
134+
traces:
135+
receivers: [jaeger, otlp, zipkin, smartagent/signalfx-forwarder]
136+
processors:
137+
- memory_limiter
138+
- batch
139+
- resourcedetection
140+
# - resource/add_environment
141+
exporters: [sapm, signalfx]
142+
metrics:
143+
receivers: [hostmetrics, signalfx, smartagent/signalfx-forwarder, smartagent/ecs-metadata]
144+
processors: [memory_limiter, batch, resourcedetection]
145+
exporters: [signalfx]
146+
metrics/internal:
147+
receivers: [prometheus/internal]
148+
processors: [memory_limiter, batch, resourcedetection/internal]
149+
exporters: [signalfx]
150+
# logs:
151+
# receivers: [otlp]
152+
# processors:
153+
# - memory_limiter
154+
# - batch
155+
# - resourcedetection
156+
# #- resource/add_environment
157+
# exporters: [splunk_hec]
158+
# # Use instead when sending to gateway
159+
# #exporters: [otlp]
160+
161+

deployments/ecs/ec2/README.md

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
# Amazon ECS EC2 Deployment
2+
Familiarity with Amazon ECS using launch type EC2 is assumed. Consult the
3+
[Getting started with the Amazon ECS console using Amazon EC2](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/getting-started-ecs-ec2.html)
4+
for further reading.
5+
6+
The
7+
[Splunk OpenTelemetry Connector](https://github.com/signalfx/splunk-otel-collector)
8+
(Collector) should be run as a Daemon service in an EC2 ECS cluster.
9+
10+
Requires Connector release v0.34.0 or newer, which corresponds to image tag 0.34.0 or newer.
11+
See image repository [here](https://quay.io/repository/signalfx/splunk-otel-collector?tab=tags).
12+
13+
## Getting Started
14+
### Create Task Definition
15+
Take the task definition JSON for the Collector [here](./splunk-otel-collector.json), replace
16+
`MY_SPLUNK_ACCESS_TOKEN` and `MY_SPLUNK_REALM` with valid values. Update the image tag to
17+
the newest version. Use the JSON to create a task definition of **EC2 launch type** following
18+
the instructions [here](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/create-task-definition.html).
19+
20+
The Collector is configured to use the default configuration file `/etc/otel/collector/ecs_ec2_config.yaml`.
21+
The Collector image Dockerfile is available [here](../../../cmd/otelcol/Dockerfile) and the contents of the default
22+
configuration file can be seen [here](../../../cmd/otelcol/config/collector/ecs_ec2_config.yaml).
23+
24+
**Note**: You do not need the `smartagent/ecs-metadata` metrics receiver in the default
25+
configuration file if all you want is tracing. You can take the default configuration, remove
26+
the receiver, then use the configuration in a custom configuration following the directions
27+
in the [custom configuration](#custom-configuration) section.
28+
29+
The configured network mode for the task is **host**. This means that **task metadata endpoint
30+
version 2** used by receiver `smartagent/ecs-metadata` is not enabled by default. See
31+
[here](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-metadata-endpoint.html)
32+
to check whether **task metadata endpoint version 3** is enabled by default for your task. If it is enabled, add the
33+
following to the **environment** list in the task definition JSON:
34+
```json
35+
{
36+
"name": "ECS_TASK_METADATA_ENDPOINT",
37+
"value": "${ECS_CONTAINER_METADATA_URI}/task"
38+
},
39+
{
40+
"name": "ECS_TASK_STATS_ENDPOINT",
41+
"value": "${ECS_CONTAINER_METADATA_URI}/task/stats"
42+
}
43+
```
44+
45+
Assign a stringified array of metrics you want excluded to environment variable
46+
`METRICS_TO_EXCLUDE`. You can set the memory limit for the memory limiter processor using
47+
environment variable `SPLUNK_MEMORY_LIMIT_MIB`. The default memory limit is 512 MiB. For
48+
more information about the memory limiter processor, see
49+
[here](https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/memorylimiter/README.md).
50+
51+
### Launch the Collector
52+
The Collector is designed to be run as a Daemon service in an EC2 ECS cluster.
53+
54+
To create a Collector service from the Amazon ECS console:
55+
56+
Go to your cluster in the console
57+
1. Click on the "Services" tab.
58+
2. Click "Create" at the top of the tab.
59+
3. Select:
60+
- Launch Type -> EC2
61+
- Task Definition (Family) -> splunk-otel-collector
62+
- Task Definition (Revision) -> 1 (or whatever the latest is in your case)
63+
- Service Name -> splunk-otel-collector
64+
- Service type -> DAEMON
65+
4. Leave everything else at default and click "Next step"
66+
5. Leave everything on this next page at their defaults and click "Next step".
67+
6. Leave everything on this next page at their defaults and click "Next step".
68+
7. Click "Create Service" and the collector should be deployed onto each node in the ECS cluster. You should see infrastructure and Docker metrics flowing soon.
69+
70+
## Custom Configuration
71+
To use a custom configuration file, replace the value of environment variable
72+
`SPLUNK_CONFIG` with the file path of the custom configuration file in Collector
73+
task definition.
74+
75+
Alternatively, you can specify the custom configuration YAML directly using environment
76+
variable `SPLUNK_CONFIG_YAML` as described [below](#direct-configuration).
77+
78+
### ecs_observer
79+
Use extension
80+
[Amazon Elastic Container Service Observer](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/extension/observer/ecsobserver#amazon-elastic-container-service-observer)
81+
(`ecs_observer`) in your custom configuration to discover metrics targets
82+
in running tasks, filtered by service names, task definitions and container labels.
83+
`ecs_observer` is currently limited to Prometheus targets and requires the read-only
84+
permissions below. You can add the permissions to the task role by adding them to a
85+
customer-managed policy that is attached to the task role.
86+
```text
87+
ecs:List*
88+
ecs:Describe*
89+
```
90+
91+
Below is an example of a custom configuration in which the `ecs_observer` is configured to find
92+
Prometheus targets in cluster `lorem-ipsum-cluster`, region `us-west-2`, where the task ARN
93+
pattern is `^arn:aws:ecs:us-west-2:906383545488:task-definition/lorem-ipsum-task:[0-9]+$`.
94+
The results are written to file `/etc/ecs_sd_targets.yaml`. The `prometheus` receiver is
95+
configured to read targets from the results file. The values for `access_token`
96+
and `realm` are read from environment variables `SPLUNK_ACCESS_TOKEN` and `SPLUNK_REALM`
97+
respectively, which must be specified in your container definition.
98+
99+
```yaml
100+
extensions:
101+
ecs_observer:
102+
refresh_interval: 10s
103+
cluster_name: 'lorem-ipsum-cluster'
104+
cluster_region: 'us-west-2'
105+
result_file: '/etc/ecs_sd_targets.yaml'
106+
task_definitions:
107+
- arn_pattern: "^arn:aws:ecs:us-west-2:906383545488:task-definition/lorem-ipsum-task:[0-9]+$"
108+
metrics_ports: [9113]
109+
metrics_path: /metrics
110+
receivers:
111+
prometheus:
112+
config:
113+
scrape_configs:
114+
- job_name: 'lorem-ipsum-nginx'
115+
scrape_interval: 10s
116+
file_sd_configs:
117+
- files:
118+
- '/etc/ecs_sd_targets.yaml'
119+
processors:
120+
batch:
121+
resourcedetection:
122+
detectors: [ecs]
123+
override: false
124+
exporters:
125+
signalfx:
126+
access_token: ${SPLUNK_ACCESS_TOKEN}
127+
realm: ${SPLUNK_REALM}
128+
service:
129+
extensions: [ecs_observer]
130+
pipelines:
131+
metrics:
132+
receivers: [prometheus]
133+
processors: [batch, resourcedetection]
134+
exporters: [signalfx]
135+
```
136+
137+
### Direct Configuration
138+
The Collector provides environment variable `SPLUNK_CONFIG_YAML` for specifying the
139+
configuration YAML directly which can be used instead of `SPLUNK_CONFIG`.
140+
141+
For example, you can store the custom configuration above in a parameter called
142+
`splunk-otel-collector-config` in **AWS Systems Manager Parameter Store**. Then
143+
assign the parameter to environment variable `SPLUNK_CONFIG_YAML` using `valueFrom`.
144+
145+
**Note:** You should add policy `AmazonSSMReadOnlyAccess` to the task role in order for
146+
the task to have read access to the Parameter Store.

0 commit comments

Comments
 (0)