
Commit 4c2f59e

kafkareceiver: don't restart when marking errors (#40140)
#### Description

If processing of a message fails but the message is still marked as consumed due to the `message_marking` configuration, log an error and continue instead of restarting the consumer. Restarting would not affect the outcome, since the marked message will not be reprocessed.

#### Link to tracking issue

Fixes #39909

#### Testing

Added unit tests.

#### Documentation

None
1 parent 4cb1330 commit 4c2f59e

File tree: 3 files changed, +145 / -12 lines changed
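For context, the settings involved live under the receiver's `message_marking` configuration. Below is a minimal sketch of a collector config exercising the affected mode, assuming an otherwise default kafka receiver; the broker address is illustrative and not part of this commit:

```yaml
receivers:
  kafka:
    brokers: ["localhost:9092"]  # illustrative
    message_marking:
      # Mark messages after processing rather than before.
      after: true
      # Also mark messages whose processing failed. With this commit, a
      # processing error in this mode is logged and consumption continues,
      # instead of restarting the consumer.
      on_error: true
```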
.chloggen changelog entry (new file)

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: bug_fix
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: kafkareceiver
+
+# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Don't restart the Kafka consumer on failed errors when message marking is enabled for them.
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [39909]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext: |
+  The Kafka consumer will now only be restarted due to failed messages if
+  both `message_marking::after` is true, and `message_marking::on_error` is false.
+  In this case the message will not be marked as consumed, and the consumer will
+  restart with an error.
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: [user]
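In other words, the behavior on a failed message now depends on the two `message_marking` flags (this summary is derived from the subtext above and the `kafka_receiver.go` diff below):

- `after: false`: the message was marked before processing; the error is logged and consumption continues.
- `after: true`, `on_error: false`: the failed message is left unmarked; the handler returns the error and the consumer restarts so the message can be reprocessed.
- `after: true`, `on_error: true`: the failed message is marked anyway; the error is logged and consumption continues, since a restart could not change the outcome.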

receiver/kafkareceiver/kafka_receiver.go

Lines changed: 12 additions & 3 deletions
@@ -528,10 +528,19 @@ func (c *consumerGroupHandler) handleMessage(
             c.logger.Info("Stop error backoff because the configured max_elapsed_time is reached",
                 zap.Duration("max_elapsed_time", c.backOff.MaxElapsedTime))
         }
-        if c.messageMarking.After && c.messageMarking.OnError {
-            session.MarkMessage(message, "")
+        if c.messageMarking.After && !c.messageMarking.OnError {
+            // Only return an error if messages are marked after successful processing.
+            return err
         }
-        return err
+        // We're either marking messages as consumed ahead of time (disregarding outcome),
+        // or after processing but including errors. Either way we should not return an error,
+        // as that will restart the consumer unnecessarily.
+        c.logger.Error("failed to consume message, skipping due to message_marking config",
+            zap.Error(err),
+            zap.String("topic", message.Topic),
+            zap.Int32("partition", claim.Partition()),
+            zap.Int64("offset", message.Offset),
+        )
     }
     if c.backOff != nil {
         c.resetBackoff()
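The change above reduces the restart decision to a single predicate: restart only when marking happens after successful processing. Here is a minimal, self-contained sketch of that predicate with hypothetical names; the receiver's actual logic lives in `handleMessage` as shown in the diff:

```go
package main

import "fmt"

// shouldRestart distills the post-change error path: returning the processing
// error (which restarts the consumer) is only useful when the failed message
// was left unmarked, i.e. marking happens after processing and excludes errors.
// This is an illustrative sketch, not the receiver's actual API.
func shouldRestart(markAfter, markOnError bool) bool {
	return markAfter && !markOnError
}

func main() {
	for _, tc := range []struct{ after, onError bool }{
		{false, false},
		{false, true},
		{true, false}, // the only combination that still restarts
		{true, true},
	} {
		fmt.Printf("after=%t on_error=%t -> restart=%t\n",
			tc.after, tc.onError, shouldRestart(tc.after, tc.onError))
	}
}
```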

receiver/kafkareceiver/kafka_receiver_test.go

Lines changed: 102 additions & 9 deletions
@@ -292,10 +292,7 @@ func TestReceiver_InternalTelemetry(t *testing.T) {

     // After receiving messages, the internal metrics should be updated.
     metadatatest.AssertEqualKafkaReceiverPartitionStart(t, tel, []metricdata.DataPoint[int64]{{
-        // 2 because:
-        // - the initial open
-        // - the invalid message causes the consumer to restart, closing the partition
-        Value: 2,
+        Value: 1,
         Attributes: attribute.NewSet(attribute.String("name", set.ID.Name())),
     }}, metricdatatest.IgnoreTimestamp())

@@ -312,19 +309,17 @@
     err = r.Shutdown(context.Background())
     require.NoError(t, err)
     metadatatest.AssertEqualKafkaReceiverPartitionClose(t, tel, []metricdata.DataPoint[int64]{{
-        // 2 because:
-        // - the invalid message causes the consumer to restart, closing the partition
-        // - it re-acquires the partition, but then shutting down closes the partition again
-        Value: 2,
+        Value: 1,
         Attributes: attribute.NewSet(
             attribute.String("name", set.ID.Name()),
         ),
     }}, metricdatatest.IgnoreTimestamp())

     observedErrorLogs := observedLogs.FilterLevelExact(zapcore.ErrorLevel)
     logEntries := observedErrorLogs.All()
-    assert.Len(t, logEntries, 1)
+    assert.Len(t, logEntries, 2)
     assert.Equal(t, "failed to unmarshal message", logEntries[0].Message)
+    assert.Equal(t, "failed to consume message, skipping due to message_marking config", logEntries[1].Message)

     metadatatest.AssertEqualKafkaReceiverCurrentOffset(t, tel, []metricdata.DataPoint[int64]{{
         Value: 4, // offset of the final message
@@ -345,6 +340,104 @@
     }}, metricdatatest.IgnoreTimestamp())
 }

+func TestReceiver_MessageMarking(t *testing.T) {
+    t.Parallel()
+    for name, testcase := range map[string]struct {
+        markAfter  bool
+        markErrors bool
+
+        errorShouldRestart bool
+    }{
+        "mark_before": {
+            markAfter: false,
+        },
+        "mark_after_success": {
+            markAfter:          true,
+            errorShouldRestart: true,
+        },
+        "mark_after_all": {
+            markAfter:  true,
+            markErrors: true,
+        },
+    } {
+        t.Run(name, func(t *testing.T) {
+            t.Parallel()
+            kafkaClient, receiverConfig := mustNewFakeCluster(t, kfake.SeedTopics(1, "otlp_spans"))
+
+            // Send some invalid data to the otlp_spans topic so unmarshaling fails,
+            // and then send some valid data to show that the invalid data does not
+            // block the consumer.
+            traces := testdata.GenerateTraces(1)
+            data, err := (&ptrace.ProtoMarshaler{}).MarshalTraces(traces)
+            require.NoError(t, err)
+            results := kafkaClient.ProduceSync(context.Background(),
+                &kgo.Record{Topic: "otlp_spans", Value: []byte("junk")},
+                &kgo.Record{Topic: "otlp_spans", Value: data},
+            )
+            require.NoError(t, results.FirstErr())
+
+            var calls atomic.Int64
+            consumer := newTracesConsumer(func(_ context.Context, received ptrace.Traces) error {
+                calls.Add(1)
+                return ptracetest.CompareTraces(traces, received)
+            })
+
+            // Only mark messages after consuming, including for errors.
+            receiverConfig.MessageMarking.After = testcase.markAfter
+            receiverConfig.MessageMarking.OnError = testcase.markErrors
+            set, tel, observedLogs := mustNewSettings(t)
+            f := NewFactory()
+            r, err := f.CreateTraces(context.Background(), set, receiverConfig, consumer)
+            require.NoError(t, err)
+            require.NoError(t, r.Start(context.Background(), componenttest.NewNopHost()))
+            t.Cleanup(func() {
+                assert.NoError(t, r.Shutdown(context.Background()))
+            })
+
+            if testcase.errorShouldRestart {
+                // Verify that the consumer restarts at least once.
+                assert.Eventually(t, func() bool {
+                    m, err := tel.GetMetric("otelcol_kafka_receiver_partition_start")
+                    require.NoError(t, err)
+
+                    dataPoints := m.Data.(metricdata.Sum[int64]).DataPoints
+                    assert.Len(t, dataPoints, 1)
+                    return dataPoints[0].Value > 5
+                }, time.Second, 100*time.Millisecond, "unmarshal error should restart consumer")
+
+                // The invalid message should block the consumer.
+                assert.Zero(t, calls.Load())
+
+                observedErrorLogs := observedLogs.FilterLevelExact(zapcore.ErrorLevel)
+                logEntries := observedErrorLogs.All()
+                require.NotEmpty(t, logEntries)
+                for _, entry := range logEntries {
+                    assert.Equal(t, "failed to unmarshal message", entry.Message)
+                }
+            } else {
+                assert.Eventually(t, func() bool {
+                    return calls.Load() == 1
+                }, time.Second, 100*time.Millisecond, "unmarshal error should not block consumption")
+
+                // Verify that the consumer did not restart.
+                metadatatest.AssertEqualKafkaReceiverPartitionStart(t, tel, []metricdata.DataPoint[int64]{{
+                    Value:      1,
+                    Attributes: attribute.NewSet(attribute.String("name", set.ID.Name())),
+                }}, metricdatatest.IgnoreTimestamp())
+
+                observedErrorLogs := observedLogs.FilterLevelExact(zapcore.ErrorLevel)
+                logEntries := observedErrorLogs.All()
+                require.Len(t, logEntries, 2)
+                assert.Equal(t, "failed to unmarshal message", logEntries[0].Message)
+                assert.Equal(t,
+                    "failed to consume message, skipping due to message_marking config",
+                    logEntries[1].Message,
+                )
+            }
+        })
+    }
+}
+
 func TestNewLogsReceiver(t *testing.T) {
     t.Parallel()
     kafkaClient, receiverConfig := mustNewFakeCluster(t, kfake.SeedTopics(1, "otlp_logs"))
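The new table-driven test can be run on its own with standard Go tooling from a checkout of the repository, e.g. `go test -run TestReceiver_MessageMarking ./receiver/kafkareceiver/`.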
