Skip to content

Commit 78a77fe

Browse files
committed
implement exponential backoff in case of memory limiter error when consuming traces
1 parent f21f718 commit 78a77fe

File tree

5 files changed

+111
-26
lines changed

5 files changed

+111
-26
lines changed

receiver/kafkareceiver/config.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"time"
88

99
"go.opentelemetry.io/collector/component"
10+
"go.opentelemetry.io/collector/config/configretry"
1011

1112
"github.com/open-telemetry/opentelemetry-collector-contrib/exporter/kafkaexporter"
1213
"github.com/open-telemetry/opentelemetry-collector-contrib/internal/kafka"
@@ -85,6 +86,9 @@ type Config struct {
8586
DefaultFetchSize int32 `mapstructure:"default_fetch_size"`
8687
// The maximum bytes per fetch from Kafka (default "0", no limit)
8788
MaxFetchSize int32 `mapstructure:"max_fetch_size"`
89+
90+
// ErrorBackOff configures how long the receiver waits before consuming the next message when the next consumer returns an error that requires backoff (for example, data refused by the memory limiter).
91+
ErrorBackOff configretry.BackOffConfig `mapstructure:"error_backoff"`
8892
}
8993

9094
const (

receiver/kafkareceiver/config_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"github.com/stretchr/testify/assert"
1212
"github.com/stretchr/testify/require"
1313
"go.opentelemetry.io/collector/component"
14+
"go.opentelemetry.io/collector/config/configretry"
1415
"go.opentelemetry.io/collector/config/configtls"
1516
"go.opentelemetry.io/collector/confmap/confmaptest"
1617

@@ -65,6 +66,9 @@ func TestLoadConfig(t *testing.T) {
6566
MinFetchSize: 1,
6667
DefaultFetchSize: 1048576,
6768
MaxFetchSize: 0,
69+
ErrorBackOff: configretry.BackOffConfig{
70+
Enabled: false,
71+
},
6872
},
6973
},
7074
{
@@ -101,6 +105,13 @@ func TestLoadConfig(t *testing.T) {
101105
MinFetchSize: 1,
102106
DefaultFetchSize: 1048576,
103107
MaxFetchSize: 0,
108+
ErrorBackOff: configretry.BackOffConfig{
109+
Enabled: true,
110+
InitialInterval: 1 * time.Second,
111+
MaxInterval: 10 * time.Second,
112+
MaxElapsedTime: 1 * time.Minute,
113+
Multiplier: 1.5,
114+
},
104115
},
105116
},
106117
}

receiver/kafkareceiver/kafka_receiver.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,12 @@ import (
99
"fmt"
1010
"strconv"
1111
"sync"
12+
"time"
1213

1314
"github.com/IBM/sarama"
15+
"github.com/cenkalti/backoff/v4"
1416
"go.opentelemetry.io/collector/component"
17+
"go.opentelemetry.io/collector/config/configretry"
1518
"go.opentelemetry.io/collector/consumer"
1619
"go.opentelemetry.io/collector/pdata/plog"
1720
"go.opentelemetry.io/collector/pdata/pmetric"
@@ -35,6 +38,8 @@ const (
3538

3639
var errInvalidInitialOffset = errors.New("invalid initial offset")
3740

41+
var errMemoryLimiterDataRefused = errors.New("data refused due to high memory usage")
42+
3843
// kafkaTracesConsumer uses sarama to consume and handle messages from kafka.
3944
type kafkaTracesConsumer struct {
4045
config Config
@@ -205,6 +210,7 @@ func (c *kafkaTracesConsumer) Start(_ context.Context, host component.Host) erro
205210
messageMarking: c.messageMarking,
206211
headerExtractor: &nopHeaderExtractor{},
207212
telemetryBuilder: c.telemetryBuilder,
213+
backOff: newExponentialBackOff(c.config.ErrorBackOff),
208214
}
209215
if c.headerExtraction {
210216
consumerGroup.headerExtractor = &headerExtractor{
@@ -218,6 +224,20 @@ func (c *kafkaTracesConsumer) Start(_ context.Context, host component.Host) erro
218224
return nil
219225
}
220226

227+
func newExponentialBackOff(config configretry.BackOffConfig) *backoff.ExponentialBackOff {
228+
if !config.Enabled {
229+
return nil
230+
}
231+
backOff := backoff.NewExponentialBackOff()
232+
backOff.InitialInterval = config.InitialInterval
233+
backOff.RandomizationFactor = config.RandomizationFactor
234+
backOff.Multiplier = config.Multiplier
235+
backOff.MaxInterval = config.MaxInterval
236+
backOff.MaxElapsedTime = config.MaxElapsedTime
237+
backOff.Reset()
238+
return backOff
239+
}
240+
221241
func (c *kafkaTracesConsumer) consumeLoop(ctx context.Context, handler sarama.ConsumerGroupHandler) {
222242
defer c.consumeLoopWG.Done()
223243
for {
@@ -481,6 +501,7 @@ type tracesConsumerGroupHandler struct {
481501
autocommitEnabled bool
482502
messageMarking MessageMarking
483503
headerExtractor HeaderExtractor
504+
backOff *backoff.ExponentialBackOff
484505
}
485506

486507
type metricsConsumerGroupHandler struct {
@@ -582,8 +603,18 @@ func (c *tracesConsumerGroupHandler) ConsumeClaim(session sarama.ConsumerGroupSe
582603
if c.messageMarking.After && c.messageMarking.OnError {
583604
session.MarkMessage(message, "")
584605
}
606+
if errorRequiresBackoff(err) && c.backOff != nil {
607+
select {
608+
case <-session.Context().Done():
609+
return nil
610+
case <-time.After(c.backOff.NextBackOff()):
611+
}
612+
}
585613
return err
586614
}
615+
if c.backOff != nil {
616+
c.backOff.Reset()
617+
}
587618
if c.messageMarking.After {
588619
session.MarkMessage(message, "")
589620
}
@@ -600,6 +631,10 @@ func (c *tracesConsumerGroupHandler) ConsumeClaim(session sarama.ConsumerGroupSe
600631
}
601632
}
602633

634+
func errorRequiresBackoff(err error) bool {
635+
return err.Error() == errMemoryLimiterDataRefused.Error()
636+
}
637+
603638
func (c *metricsConsumerGroupHandler) Setup(session sarama.ConsumerGroupSession) error {
604639
c.readyCloser.Do(func() {
605640
close(c.ready)

receiver/kafkareceiver/kafka_receiver_test.go

Lines changed: 55 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"time"
1212

1313
"github.com/IBM/sarama"
14+
"github.com/cenkalti/backoff/v4"
1415
"github.com/stretchr/testify/assert"
1516
"github.com/stretchr/testify/require"
1617
"go.opentelemetry.io/collector/component"
@@ -340,35 +341,63 @@ func TestTracesConsumerGroupHandler_error_nextConsumer(t *testing.T) {
340341
consumerError := errors.New("failed to consume")
341342
obsrecv, err := receiverhelper.NewObsReport(receiverhelper.ObsReportSettings{ReceiverCreateSettings: receivertest.NewNopSettings()})
342343
require.NoError(t, err)
343-
c := tracesConsumerGroupHandler{
344-
unmarshaler: newPdataTracesUnmarshaler(&ptrace.ProtoUnmarshaler{}, defaultEncoding),
345-
logger: zap.NewNop(),
346-
ready: make(chan bool),
347-
nextConsumer: consumertest.NewErr(consumerError),
348-
obsrecv: obsrecv,
349-
headerExtractor: &nopHeaderExtractor{},
350-
telemetryBuilder: nopTelemetryBuilder(t),
351-
}
352344

353-
wg := sync.WaitGroup{}
354-
wg.Add(1)
355-
groupClaim := &testConsumerGroupClaim{
356-
messageChan: make(chan *sarama.ConsumerMessage),
345+
tests := []struct {
346+
name string
347+
err error
348+
expectedBackoff time.Duration
349+
}{
350+
{
351+
name: "memory limiter data refused error",
352+
err: errMemoryLimiterDataRefused,
353+
expectedBackoff: backoff.DefaultInitialInterval,
354+
},
355+
{
356+
name: "consumer error that does not require backoff",
357+
err: consumerError,
358+
expectedBackoff: 0,
359+
},
357360
}
358-
go func() {
359-
e := c.ConsumeClaim(testConsumerGroupSession{ctx: context.Background()}, groupClaim)
360-
assert.EqualError(t, e, consumerError.Error())
361-
wg.Done()
362-
}()
363361

364-
td := ptrace.NewTraces()
365-
td.ResourceSpans().AppendEmpty()
366-
unmarshaler := &ptrace.ProtoMarshaler{}
367-
bts, err := unmarshaler.MarshalTraces(td)
368-
require.NoError(t, err)
369-
groupClaim.messageChan <- &sarama.ConsumerMessage{Value: bts}
370-
close(groupClaim.messageChan)
371-
wg.Wait()
362+
for _, tt := range tests {
363+
t.Run(tt.name, func(t *testing.T) {
364+
backOff := backoff.NewExponentialBackOff()
365+
backOff.RandomizationFactor = 0
366+
c := tracesConsumerGroupHandler{
367+
unmarshaler: newPdataTracesUnmarshaler(&ptrace.ProtoUnmarshaler{}, defaultEncoding),
368+
logger: zap.NewNop(),
369+
ready: make(chan bool),
370+
nextConsumer: consumertest.NewErr(tt.err),
371+
obsrecv: obsrecv,
372+
headerExtractor: &nopHeaderExtractor{},
373+
telemetryBuilder: nopTelemetryBuilder(t),
374+
backOff: backOff,
375+
}
376+
377+
wg := sync.WaitGroup{}
378+
wg.Add(1)
379+
groupClaim := &testConsumerGroupClaim{
380+
messageChan: make(chan *sarama.ConsumerMessage),
381+
}
382+
go func() {
383+
start := time.Now()
384+
e := c.ConsumeClaim(testConsumerGroupSession{ctx: context.Background()}, groupClaim)
385+
end := time.Now()
386+
assert.EqualError(t, e, tt.err.Error())
387+
assert.WithinDuration(t, start.Add(tt.expectedBackoff), end, 100*time.Millisecond)
388+
wg.Done()
389+
}()
390+
391+
td := ptrace.NewTraces()
392+
td.ResourceSpans().AppendEmpty()
393+
unmarshaler := &ptrace.ProtoMarshaler{}
394+
bts, err := unmarshaler.MarshalTraces(td)
395+
require.NoError(t, err)
396+
groupClaim.messageChan <- &sarama.ConsumerMessage{Value: bts}
397+
close(groupClaim.messageChan)
398+
wg.Wait()
399+
})
400+
}
372401
}
373402

374403
func TestTracesReceiver_encoding_extension(t *testing.T) {

receiver/kafkareceiver/testdata/config.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,9 @@ kafka/logs:
3535
retry:
3636
max: 10
3737
backoff: 5s
38+
error_backoff:
39+
enabled: true
40+
initial_interval: 1s
41+
max_interval: 10s
42+
max_elapsed_time: 1m
43+
multiplier: 1.5

0 commit comments

Comments
 (0)