@@ -16,7 +16,9 @@ import (
 	v2 "github.com/elastic/beats/v7/filebeat/input/v2"
 	"github.com/elastic/beats/v7/libbeat/beat"
 	"github.com/elastic/beats/v7/libbeat/common/backoff"
+	"github.com/elastic/beats/v7/libbeat/management/status"
 	"github.com/elastic/beats/v7/libbeat/statestore"
+	"github.com/elastic/beats/v7/x-pack/libbeat/statusreporterhelper"
 	"github.com/elastic/elastic-agent-libs/logp"
 	"github.com/elastic/go-concert/timed"
 )
@@ -37,6 +39,9 @@ type s3PollerInput struct {
 	s3ObjectHandler s3ObjectHandlerFactory
 	states          *states
 	filterProvider  *filterProvider
+
+	// health status reporting
+	status status.StatusReporter
 }
 
 func newS3PollerInput(
@@ -62,21 +67,32 @@ func (in *s3PollerInput) Run(
 	inputContext v2.Context,
 	pipeline beat.Pipeline,
 ) error {
+
 	in.log = inputContext.Logger.Named("s3")
+
+	in.status = statusreporterhelper.New(inputContext.StatusReporter, in.log, "S3")
+	defer in.status.UpdateStatus(status.Stopped, "")
+	in.status.UpdateStatus(status.Starting, "Input starting")
+
 	in.pipeline = pipeline
 	var err error
 
 	// Load the persistent S3 polling state.
 	in.states, err = newStates(in.log, in.store, in.config.BucketListPrefix)
 	if err != nil {
-		return fmt.Errorf("can not start persistent store: %w", err)
+		err = fmt.Errorf("can not start persistent store: %w", err)
+		in.status.UpdateStatus(status.Failed, fmt.Sprintf("Setup failure: %s", err.Error()))
+		return err
 	}
 	defer in.states.Close()
 
 	ctx := v2.GoContextFromCanceler(inputContext.Cancelation)
+	in.status.UpdateStatus(status.Configuring, "Configuring input")
 	in.s3, err = in.createS3API(ctx)
 	if err != nil {
-		return fmt.Errorf("failed to create S3 API: %w", err)
+		err = fmt.Errorf("failed to create S3 API for bucket ARN '%s': Error: %w", in.config.getBucketARN(), err)
+		in.status.UpdateStatus(status.Failed, fmt.Sprintf("Setup failure: %s", err.Error()))
+		return err
 	}
 
 	in.metrics = newInputMetrics(inputContext.MetricsRegistry, in.config.NumberOfWorkers)
@@ -97,6 +113,7 @@ func (in *s3PollerInput) Run(
 func (in *s3PollerInput) run(ctx context.Context) {
 	// Scan the bucket in a loop, delaying by the configured interval each
 	// iteration.
+	in.status.UpdateStatus(status.Running, "Input is running")
 	for ctx.Err() == nil {
 		in.runPoll(ctx)
 		_ = timed.Wait(ctx, in.config.BucketListInterval)
@@ -129,6 +146,7 @@ func (in *s3PollerInput) runPoll(ctx context.Context) {
 	err := in.states.CleanUp(ids)
 	if err != nil {
 		in.log.Errorf("failed to cleanup states: %v", err.Error())
+		in.status.UpdateStatus(status.Degraded, fmt.Sprintf("Input state cleanup failure: %s", err.Error()))
 	}
 }
 
@@ -138,13 +156,13 @@ func (in *s3PollerInput) workerLoop(ctx context.Context, workChan <-chan state)
 	client, err := createPipelineClient(in.pipeline, acks)
 	if err != nil {
 		in.log.Errorf("failed to create pipeline client: %v", err.Error())
+		in.status.UpdateStatus(status.Degraded, fmt.Sprintf("A worker's pipeline client setup failed, error: %s", err.Error()))
 		return
 	}
 	defer client.Close()
 	defer acks.Close()
 
 	rateLimitWaiter := backoff.NewEqualJitterBackoff(ctx.Done(), 1, 120)
-
 	for _state := range workChan {
 		state := _state
 		event := in.s3EventForState(state)
@@ -166,6 +184,9 @@ func (in *s3PollerInput) workerLoop(ctx context.Context, workChan <-chan state)
 		if errors.Is(err, errS3DownloadFailed) {
 			// Download errors are ephemeral. Add a backoff delay, then skip to the
 			// next iteration so we don't mark the object as permanently failed.
+			in.status.UpdateStatus(status.Degraded,
+				fmt.Sprintf("S3 download failure for object key '%s' in bucket '%s': %s",
+					state.Key, state.Bucket, err.Error()))
 			rateLimitWaiter.Wait()
 			continue
 		}
@@ -176,7 +197,10 @@ func (in *s3PollerInput) workerLoop(ctx context.Context, workChan <-chan state)
 		if err != nil {
 			in.log.Errorf("failed processing S3 event for object key %q in bucket %q: %v",
 				state.Key, state.Bucket, err.Error())
-
+			in.status.UpdateStatus(status.Degraded,
+				fmt.Sprintf(
+					"S3 object processing failure for object key '%s' in bucket '%s': %s",
+					state.Key, state.Bucket, err.Error()))
 			// Non-retryable error.
 			state.Failed = true
 		} else {
@@ -188,6 +212,9 @@ func (in *s3PollerInput) workerLoop(ctx context.Context, workChan <-chan state)
 		err := in.states.AddState(state)
 		if err != nil {
 			in.log.Errorf("saving completed object state: %v", err.Error())
+			in.status.UpdateStatus(status.Degraded, fmt.Sprintf("Failure checkpointing (saving completed object state): %s", err.Error()))
+		} else {
+			in.status.UpdateStatus(status.Running, "Input is running")
 		}
 
 		// Metrics
@@ -213,12 +240,14 @@ func (in *s3PollerInput) readerLoop(ctx context.Context, workChan chan<- state)
 
 		if err != nil {
 			in.log.Warnw("Error when paginating listing.", "error", err)
+			in.status.UpdateStatus(status.Degraded, fmt.Sprintf("S3 pagination error: %s", err.Error()))
 			// QuotaExceededError is client-side rate limiting in the AWS sdk,
 			// don't include it in the circuit breaker count
 			if !errors.As(err, &ratelimit.QuotaExceededError{}) {
 				circuitBreaker++
 				if circuitBreaker >= readerLoopMaxCircuitBreaker {
 					in.log.Warnw(fmt.Sprintf("%d consecutive error when paginating listing, breaking the circuit.", circuitBreaker), "error", err)
+					in.status.UpdateStatus(status.Degraded, fmt.Sprintf("Too many consecutive errors (%d) in S3 pagination. Error: %s", circuitBreaker, err.Error()))
 					return nil, false
 				}
 			}
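
Below is a minimal sketch, not part of this change, of a fake reporter that records status transitions. It could be handed in as `inputContext.StatusReporter` in a test to assert the sequence this patch introduces (Starting → Configuring → Running, with Degraded/Failed on errors). It assumes only the call shape visible in the diff, `UpdateStatus(status.<Constant>, msg)`, plus a `status.Status` type for those constants; the names `recordingReporter`, `transition`, and `Transitions` are hypothetical and purely illustrative.

```go
package awss3_test // hypothetical test package, for illustration only

import (
	"sync"

	"github.com/elastic/beats/v7/libbeat/management/status"
)

// transition captures a single UpdateStatus call.
type transition struct {
	Status  status.Status
	Message string
}

// recordingReporter is a hypothetical test double that satisfies the
// UpdateStatus call shape used in the input and records every transition.
type recordingReporter struct {
	mu          sync.Mutex
	Transitions []transition
}

func (r *recordingReporter) UpdateStatus(s status.Status, msg string) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.Transitions = append(r.Transitions, transition{Status: s, Message: msg})
}
```

A test could then, for example, force `createS3API` to fail and assert that the last recorded transition is `status.Failed` with a "Setup failure" message.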