Skip to content

Commit 01181cb

Browse files
committed
[Logs] Move the CW Agent configuration authored by ParallelCluster to /etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json.
This prevents a race condition in the way we start the CW agent that could lead to the unintended deletion of the config file and, eventually, to a node bootstrap failure caused by the CW agent failing to start.
1 parent 778d32a commit 01181cb

File tree

4 files changed

+10
-4
lines changed

4 files changed

+10
-4
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste
1515
- Upgrade DCV to version 2024.0-19030.
1616
- Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management.
1717

18+
**BUG FIXES**
19+
- Fix a race condition in CloudWatch Agent startup that could cause node bootstrap failures.
20+
1821
3.13.2
1922
------
2023

cookbooks/aws-parallelcluster-environment/files/cloudwatch/write_cloudwatch_agent_json.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
from cloudwatch_agent_common_utils import render_jinja_template
1515

16-
AWS_CLOUDWATCH_CFG_PATH = "/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json"
16+
AWS_CLOUDWATCH_CFG_PATH = "/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json"
1717
DEFAULT_METRICS_COLLECTION_INTERVAL = 60
1818

1919

@@ -45,6 +45,7 @@ def gethostname():
4545

4646
def write_config(config):
4747
"""Write config to AWS_CLOUDWATCH_CFG_PATH."""
48+
os.makedirs(os.path.dirname(AWS_CLOUDWATCH_CFG_PATH), exist_ok=True)
4849
with open(AWS_CLOUDWATCH_CFG_PATH, "w+", encoding="utf-8") as output_config_file:
4950
json.dump(config, output_config_file, indent=4)
5051

cookbooks/aws-parallelcluster-environment/resources/cloudwatch/partial/_cloudwatch_common.rb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,8 @@ def package_path
163163
command "#{cookbook_virtualenv_path}/bin/python #{validator_script_path}"
164164
end unless redhat_on_docker?
165165

166+
CW_AGENT_CONFIG_JSON = '/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json'
167+
166168
execute "cloudwatch-config-creation" do
167169
user 'root'
168170
timeout 300
@@ -182,6 +184,6 @@ def package_path
182184
execute "cloudwatch-agent-start" do
183185
user 'root'
184186
timeout 300
185-
command "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s"
187+
command "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c file:#{CW_AGENT_CONFIG_JSON} -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:#{CW_AGENT_CONFIG_JSON} -s"
186188
end unless node['cluster']['cw_logging_enabled'] != 'true' || on_docker?
187189
end

cookbooks/aws-parallelcluster-environment/spec/unit/resources/cloudwatch_spec.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ def self.configure(chef_run)
255255
is_expected.to run_execute("cloudwatch-agent-start").with(
256256
user: 'root',
257257
timeout: 300,
258-
command: "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s"
258+
command: "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c file:/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json -s"
259259
)
260260
end
261261
end
@@ -313,7 +313,7 @@ def self.configure(chef_run)
313313
is_expected.to run_execute("cloudwatch-agent-start").with(
314314
user: 'root',
315315
timeout: 300,
316-
command: "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s"
316+
command: "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c file:/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json -s"
317317
)
318318
end
319319
end

0 commit comments

Comments
 (0)