
Commit 450a4ce

[chore] Add ability to configure sizer for default batcher (#12744)
This functionality is not yet exposed to users; it will be in follow-up PRs.

Signed-off-by: Bogdan Drutu <[email protected]>
1 parent 1439116 commit 450a4ce
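At a glance, the change threads a configurable sizer into the default batcher through a new batcherSettings struct instead of hard-wiring item counts. A minimal sketch of the new internal wiring, mirroring the call sites added in queue_batch.go and the tests below (the config values and the next send function are placeholders, not taken from the diff):

```go
// Sketch only: this mirrors how the batcher is now constructed internally.
// Only the items sizer is wired up in this commit, but any
// request.Sizer[request.Request] could be injected the same way.
cfg := BatchConfig{
	FlushTimeout: 200 * time.Millisecond,
	MinSize:      1000,  // MinSize/MaxSize are now int64
	MaxSize:      10000,
}
ba := newDefaultBatcher(cfg, batcherSettings[request.Request]{
	sizerType:  request.SizerTypeItems,  // how MergeSplit partitions requests
	sizer:      request.NewItemsSizer(), // how batches are measured against MinSize
	next:       next,                    // placeholder downstream sender.SendFunc[request.Request]
	maxWorkers: 1,
})
```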

File tree: 5 files changed (+74, -27 lines)

exporter/exporterhelper/internal/queue_sender.go

Lines changed: 2 additions & 2 deletions
@@ -203,9 +203,9 @@ type SizeConfig struct {
 	Sizer request.SizerType `mapstructure:"sizer"`

 	// MinSize defines the configuration for the minimum size of a batch.
-	MinSize int `mapstructure:"min_size"`
+	MinSize int64 `mapstructure:"min_size"`
 	// MaxSize defines the configuration for the maximum size of a batch.
-	MaxSize int `mapstructure:"max_size"`
+	MaxSize int64 `mapstructure:"max_size"`
 }

 func (c *BatcherConfig) Validate() error {

exporter/exporterhelper/internal/queuebatch/config.go

Lines changed: 2 additions & 2 deletions
@@ -79,10 +79,10 @@ type BatchConfig struct {
 	FlushTimeout time.Duration `mapstructure:"flush_timeout"`

 	// MinSize defines the configuration for the minimum size of a batch.
-	MinSize int `mapstructure:"min_size"`
+	MinSize int64 `mapstructure:"min_size"`

 	// MaxSize defines the configuration for the maximum size of a batch.
-	MaxSize int `mapstructure:"max_size"`
+	MaxSize int64 `mapstructure:"max_size"`
 }

 func (cfg *BatchConfig) Validate() error {

exporter/exporterhelper/internal/queuebatch/default_batcher.go

Lines changed: 27 additions & 16 deletions
@@ -21,10 +21,19 @@ type batch struct {
 	done multiDone
 }

+type batcherSettings[K any] struct {
+	sizerType  request.SizerType
+	sizer      request.Sizer[K]
+	next       sender.SendFunc[K]
+	maxWorkers int
+}
+
 // defaultBatcher continuously batch incoming requests and flushes asynchronously if minimum size limit is met or on timeout.
 type defaultBatcher struct {
-	batchCfg    BatchConfig
+	cfg         BatchConfig
 	workerPool  chan struct{}
+	sizerType   request.SizerType
+	sizer       request.Sizer[request.Request]
 	consumeFunc sender.SendFunc[request.Request]
 	stopWG      sync.WaitGroup
 	currentBatchMu sync.Mutex
@@ -33,35 +42,37 @@ type defaultBatcher struct {
 	shutdownCh chan struct{}
 }

-func newDefaultBatcher(batchCfg BatchConfig, consumeFunc sender.SendFunc[request.Request], maxWorkers int) *defaultBatcher {
+func newDefaultBatcher(bCfg BatchConfig, bSet batcherSettings[request.Request]) *defaultBatcher {
 	// TODO: Determine what is the right behavior for this in combination with async queue.
 	var workerPool chan struct{}
-	if maxWorkers != 0 {
-		workerPool = make(chan struct{}, maxWorkers)
-		for i := 0; i < maxWorkers; i++ {
+	if bSet.maxWorkers != 0 {
+		workerPool = make(chan struct{}, bSet.maxWorkers)
+		for i := 0; i < bSet.maxWorkers; i++ {
 			workerPool <- struct{}{}
 		}
 	}
 	return &defaultBatcher{
-		batchCfg:    batchCfg,
+		cfg:         bCfg,
 		workerPool:  workerPool,
-		consumeFunc: consumeFunc,
+		sizerType:   bSet.sizerType,
+		sizer:       bSet.sizer,
+		consumeFunc: bSet.next,
 		stopWG:      sync.WaitGroup{},
 		shutdownCh:  make(chan struct{}, 1),
 	}
 }

 func (qb *defaultBatcher) resetTimer() {
-	if qb.batchCfg.FlushTimeout > 0 {
-		qb.timer.Reset(qb.batchCfg.FlushTimeout)
+	if qb.cfg.FlushTimeout > 0 {
+		qb.timer.Reset(qb.cfg.FlushTimeout)
 	}
 }

 func (qb *defaultBatcher) Consume(ctx context.Context, req request.Request, done Done) {
 	qb.currentBatchMu.Lock()

 	if qb.currentBatch == nil {
-		reqList, mergeSplitErr := req.MergeSplit(ctx, qb.batchCfg.MaxSize, request.SizerTypeItems, nil)
+		reqList, mergeSplitErr := req.MergeSplit(ctx, int(qb.cfg.MaxSize), qb.sizerType, nil)
 		if mergeSplitErr != nil || len(reqList) == 0 {
 			done.OnDone(mergeSplitErr)
 			qb.currentBatchMu.Unlock()
@@ -76,7 +87,7 @@ func (qb *defaultBatcher) Consume(ctx context.Context, req request.Request, done
 	// We have at least one result in the reqList. Last in the list may not have enough data to be flushed.
 	// Find if it has at least MinSize, and if it does then move that as the current batch.
 	lastReq := reqList[len(reqList)-1]
-	if lastReq.ItemsCount() < qb.batchCfg.MinSize {
+	if qb.sizer.Sizeof(lastReq) < qb.cfg.MinSize {
 		// Do not flush the last item and add it to the current batch.
 		reqList = reqList[:len(reqList)-1]
 		qb.currentBatch = &batch{
@@ -95,7 +106,7 @@ func (qb *defaultBatcher) Consume(ctx context.Context, req request.Request, done
 		return
 	}

-	reqList, mergeSplitErr := qb.currentBatch.req.MergeSplit(ctx, qb.batchCfg.MaxSize, request.SizerTypeItems, req)
+	reqList, mergeSplitErr := qb.currentBatch.req.MergeSplit(ctx, int(qb.cfg.MaxSize), qb.sizerType, req)
 	// If failed to merge signal all Done callbacks from current batch as well as the current request and reset the current batch.
 	if mergeSplitErr != nil || len(reqList) == 0 {
 		done.OnDone(mergeSplitErr)
@@ -121,7 +132,7 @@ func (qb *defaultBatcher) Consume(ctx context.Context, req request.Request, done
 	// cannot unlock and re-lock because we are not done processing all the responses.
 	var firstBatch *batch
 	// Need to check the currentBatch if more than 1 result returned or if 1 result return but larger than MinSize.
-	if len(reqList) > 1 || qb.currentBatch.req.ItemsCount() >= qb.batchCfg.MinSize {
+	if len(reqList) > 1 || qb.sizer.Sizeof(qb.currentBatch.req) >= qb.cfg.MinSize {
 		firstBatch = qb.currentBatch
 		qb.currentBatch = nil
 	}
@@ -131,7 +142,7 @@ func (qb *defaultBatcher) Consume(ctx context.Context, req request.Request, done
 	// If we still have results to process, then we need to check if the last result has enough data to flush, or we add it to the currentBatch.
 	if len(reqList) > 0 {
 		lastReq := reqList[len(reqList)-1]
-		if lastReq.ItemsCount() < qb.batchCfg.MinSize {
+		if qb.sizer.Sizeof(lastReq) < qb.cfg.MinSize {
 			// Do not flush the last item and add it to the current batch.
 			reqList = reqList[:len(reqList)-1]
 			qb.currentBatch = &batch{
@@ -170,8 +181,8 @@ func (qb *defaultBatcher) startTimeBasedFlushingGoroutine() {

 // Start starts the goroutine that reads from the queue and flushes asynchronously.
 func (qb *defaultBatcher) Start(_ context.Context, _ component.Host) error {
-	if qb.batchCfg.FlushTimeout > 0 {
-		qb.timer = time.NewTimer(qb.batchCfg.FlushTimeout)
+	if qb.cfg.FlushTimeout > 0 {
+		qb.timer = time.NewTimer(qb.cfg.FlushTimeout)
 		qb.startTimeBasedFlushingGoroutine()
 	}
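The batcher now measures requests through the injected sizer (qb.sizer.Sizeof) and passes the configured qb.sizerType to MergeSplit, instead of calling ItemsCount() and hard-coding request.SizerTypeItems. The definition of request.Sizer is not shown in this diff; judging from the calls above it is roughly the interface sketched below, and the bytes-based implementation is purely hypothetical, included only to illustrate what a non-items sizer could look like:

```go
// Approximation of the interface implied by qb.sizer.Sizeof(req); the real
// definition lives in the internal request package and may differ in detail.
type Sizer[K any] interface {
	Sizeof(req K) int64
}

// Hypothetical bytes-based sizer (not part of this commit).
type bytesSizer struct{}

func (bytesSizer) Sizeof(req request.Request) int64 {
	// Assumption: a request can estimate its serialized size; estimateBytes
	// is a placeholder helper, not an existing API.
	return estimateBytes(req)
}
```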

exporter/exporterhelper/internal/queuebatch/default_batcher_test.go

Lines changed: 37 additions & 6 deletions
@@ -15,6 +15,7 @@ import (
 	"github.com/stretchr/testify/require"

 	"go.opentelemetry.io/collector/component/componenttest"
+	"go.opentelemetry.io/collector/exporter/exporterhelper/internal/request"
 	"go.opentelemetry.io/collector/exporter/exporterhelper/internal/requesttest"
 )

@@ -40,7 +41,12 @@ func TestDefaultBatcher_NoSplit_MinThresholdZero_TimeoutDisabled(t *testing.T) {
 			}

 			sink := requesttest.NewSink()
-			ba := newDefaultBatcher(cfg, sink.Export, tt.maxWorkers)
+			ba := newDefaultBatcher(cfg, batcherSettings[request.Request]{
+				sizerType:  request.SizerTypeItems,
+				sizer:      request.NewItemsSizer(),
+				next:       sink.Export,
+				maxWorkers: tt.maxWorkers,
+			})
 			require.NoError(t, ba.Start(context.Background(), componenttest.NewNopHost()))
 			t.Cleanup(func() {
 				require.NoError(t, ba.Shutdown(context.Background()))
@@ -87,7 +93,12 @@ func TestDefaultBatcher_NoSplit_TimeoutDisabled(t *testing.T) {
 			}

 			sink := requesttest.NewSink()
-			ba := newDefaultBatcher(cfg, sink.Export, tt.maxWorkers)
+			ba := newDefaultBatcher(cfg, batcherSettings[request.Request]{
+				sizerType:  request.SizerTypeItems,
+				sizer:      request.NewItemsSizer(),
+				next:       sink.Export,
+				maxWorkers: tt.maxWorkers,
+			})
 			require.NoError(t, ba.Start(context.Background(), componenttest.NewNopHost()))

 			done := newFakeDone()
@@ -149,7 +160,12 @@ func TestDefaultBatcher_NoSplit_WithTimeout(t *testing.T) {
 			}

 			sink := requesttest.NewSink()
-			ba := newDefaultBatcher(cfg, sink.Export, tt.maxWorkers)
+			ba := newDefaultBatcher(cfg, batcherSettings[request.Request]{
+				sizerType:  request.SizerTypeItems,
+				sizer:      request.NewItemsSizer(),
+				next:       sink.Export,
+				maxWorkers: tt.maxWorkers,
+			})
 			require.NoError(t, ba.Start(context.Background(), componenttest.NewNopHost()))
 			t.Cleanup(func() {
 				require.NoError(t, ba.Shutdown(context.Background()))
@@ -202,7 +218,12 @@ func TestDefaultBatcher_Split_TimeoutDisabled(t *testing.T) {
 			}

 			sink := requesttest.NewSink()
-			ba := newDefaultBatcher(cfg, sink.Export, tt.maxWorkers)
+			ba := newDefaultBatcher(cfg, batcherSettings[request.Request]{
+				sizerType:  request.SizerTypeItems,
+				sizer:      request.NewItemsSizer(),
+				next:       sink.Export,
+				maxWorkers: tt.maxWorkers,
+			})
 			require.NoError(t, ba.Start(context.Background(), componenttest.NewNopHost()))

 			done := newFakeDone()
@@ -249,7 +270,12 @@ func TestDefaultBatcher_Shutdown(t *testing.T) {
 	}

 	sink := requesttest.NewSink()
-	ba := newDefaultBatcher(cfg, sink.Export, 2)
+	ba := newDefaultBatcher(cfg, batcherSettings[request.Request]{
+		sizerType:  request.SizerTypeItems,
+		sizer:      request.NewItemsSizer(),
+		next:       sink.Export,
+		maxWorkers: 2,
+	})
 	require.NoError(t, ba.Start(context.Background(), componenttest.NewNopHost()))

 	done := newFakeDone()
@@ -277,7 +303,12 @@ func TestDefaultBatcher_MergeError(t *testing.T) {
 	}

 	sink := requesttest.NewSink()
-	ba := newDefaultBatcher(cfg, sink.Export, 2)
+	ba := newDefaultBatcher(cfg, batcherSettings[request.Request]{
+		sizerType:  request.SizerTypeItems,
+		sizer:      request.NewItemsSizer(),
+		next:       sink.Export,
+		maxWorkers: 2,
+	})

 	require.NoError(t, ba.Start(context.Background(), componenttest.NewNopHost()))
 	t.Cleanup(func() {

exporter/exporterhelper/internal/queuebatch/queue_batch.go

Lines changed: 6 additions & 1 deletion
@@ -40,7 +40,12 @@ func NewQueueBatch(
 	default:
 		// TODO: https://github.com/open-telemetry/opentelemetry-collector/issues/12244
 		cfg.NumConsumers = 1
-		b = newDefaultBatcher(*cfg.Batch, next, cfg.NumConsumers)
+		b = newDefaultBatcher(*cfg.Batch, batcherSettings[request.Request]{
+			sizerType:  request.SizerTypeItems,
+			sizer:      request.NewItemsSizer(),
+			next:       next,
+			maxWorkers: cfg.NumConsumers,
+		})
 	}

 	var q Queue[request.Request]
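Note that NewQueueBatch still hard-codes the items sizer at this call site, which matches the commit message: the knob exists internally but is not yet driven by user configuration. A speculative sketch of what the follow-up wiring might look like once a sizer type is threaded through the config (the cfg.Sizer field shown is an assumption, not something this commit adds):

```go
// Speculative follow-up (not in this commit): select the sizer from a
// configured request.SizerType instead of hard-coding items.
var sizer request.Sizer[request.Request]
switch cfg.Sizer { // assumed field carrying the configured request.SizerType
case request.SizerTypeItems:
	sizer = request.NewItemsSizer()
default:
	// Reject or fall back for sizer types that are not supported yet.
	sizer = request.NewItemsSizer()
}
b = newDefaultBatcher(*cfg.Batch, batcherSettings[request.Request]{
	sizerType:  cfg.Sizer,
	sizer:      sizer,
	next:       next,
	maxWorkers: cfg.NumConsumers,
})
```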
