Skip to content

Commit 02d0102

Browse files
feat: adaptive array builders base types and string implementation (#473)
Part of: #533 Very rough implementation of adaptive array builders. This my "rust" version of the builder's we've implemented in golang here: https://github.com/open-telemetry/otel-arrow/blob/main/go/pkg/otel/common/schema/builder/record.go The idea behind these is that when we're encoding OTAP records, we often want to dynamically create columns in some record batch that that either aren't added to the record batch (if all the values are null), or are dictionary encoded with the smallest possible index, or are the native array if the dictionary index would overflow. (Some of this was alluded to in yesterday's SIG meeting). The intended usage is something like this: ```rs use otel_arrow_rust::encode::record::array::StringArrayBuilder; let mut str_builder = StringArrayBuilder::new(ArrayOptions { nullable: true, dictionary_options: Some(DictionaryOptions { min_cardinality: u8::MAX.into(), max_cardinality: u16::MAX, }), }); // maybe append some values str_builder.append_value(&"a".to_string()); let result = str_builder.finish(); let mut fields = Vec::new(); let mut columns = Vec::new(); if let Some(result) = result { fields.push(Field:new("str", result.data_type, true)); columns.push(result.array); } let record_batch = RecordBatch::try_new( Arc::new(Schema::new(fields)), columns ) .expect("should work"); ``` Followup work includes: - null support #534 - additional datatype support: #535 - optimize the conversion between Dict<u8> -> Dict<u16> #536 --------- Co-authored-by: Laurent Quérel <[email protected]> Co-authored-by: Laurent Quérel <[email protected]>
1 parent ac1d1c9 commit 02d0102

File tree

6 files changed

+932
-0
lines changed

6 files changed

+932
-0
lines changed

rust/otel-arrow-rust/src/encode.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
// Copyright The OpenTelemetry Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
mod record;
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
// Copyright The OpenTelemetry Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
// TODO remove allow(unused) when we use what's in this module to encode OTAP
5+
#[allow(unused)]
6+
mod array;
Lines changed: 293 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,293 @@
1+
// Copyright The OpenTelemetry Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
//! This module provides adaptive array builders for Arrow arrays.
5+
//!
6+
//! Often for OTel-Arrow, we have columns that are optional on the schema. For example, the Boolean
7+
//! column may not be present in a record batch representing a list of attributes only be present
8+
//! if there are some boolean type values in the list.
9+
//!
10+
//! There are also cases where we want to dynamically use dictionary encoding with the smallest index
11+
//! the cardinality of the allows.
12+
//!
13+
//! This module contains adaptive array builders that can dynamically create either no array (for
14+
//! an all-null) column, an array that may be a dictionary, of an array or native types. It will
15+
//! handle converting between different builders dynamically based on the data which is appended.
16+
17+
use arrow::array::{ArrayRef, StringBuilder, StringDictionaryBuilder};
18+
use arrow::datatypes::{DataType, UInt8Type, UInt16Type};
19+
20+
use crate::arrays::NullableArrayAccessor;
21+
22+
use dictionary::{
23+
AdaptiveDictionaryBuilder, ConvertToNativeHelper, DictionaryArrayBuilder,
24+
DictionaryBuilderError, DictionaryOptions, UpdateDictionaryIndexInto,
25+
};
26+
27+
pub mod dictionary;
28+
pub mod string;
29+
30+
/// This is the base trait that array builders should implement.
31+
pub trait ArrayBuilder {
32+
type Native;
33+
34+
fn append_value(&mut self, value: &Self::Native);
35+
36+
fn finish(&mut self) -> ArrayWithType;
37+
}
38+
39+
pub struct ArrayWithType {
40+
pub array: ArrayRef,
41+
pub data_type: DataType,
42+
}
43+
44+
/// This is a helper trait that allows the adaptive builders to construct new
45+
/// instances of the builder dynamically
46+
pub trait ArrayBuilderConstructor {
47+
fn new() -> Self;
48+
49+
// TODO, at some point we may consider optionally adding a
50+
// with_capacity function here that could be used to create
51+
// a builder with pre-allocated buffers
52+
}
53+
54+
/// This enum is a container that abstracts array builder which is either
55+
/// dictionary or native. It converts from the dictionary builder to the
56+
/// native builder when the dictionary builder overflows.
57+
enum MaybeDictionaryBuilder<
58+
NativeBuilder: ArrayBuilder + ArrayBuilderConstructor,
59+
DictBuilderU8: DictionaryArrayBuilder<UInt8Type> + ArrayBuilderConstructor,
60+
DictBuilderU16: DictionaryArrayBuilder<UInt16Type> + ArrayBuilderConstructor,
61+
> {
62+
Native(NativeBuilder),
63+
Dictionary(AdaptiveDictionaryBuilder<DictBuilderU8, DictBuilderU16>),
64+
}
65+
66+
impl<T, TN, TD8, TD16> ArrayBuilder for MaybeDictionaryBuilder<TN, TD8, TD16>
67+
where
68+
TN: ArrayBuilder<Native = T> + ArrayBuilderConstructor,
69+
TD8: DictionaryArrayBuilder<UInt8Type, Native = T>
70+
+ ArrayBuilderConstructor
71+
+ ConvertToNativeHelper,
72+
<TD8 as ConvertToNativeHelper>::Accessor: NullableArrayAccessor<Native = T> + 'static,
73+
TD16: DictionaryArrayBuilder<UInt16Type, Native = T>
74+
+ ArrayBuilderConstructor
75+
+ ConvertToNativeHelper,
76+
<TD16 as ConvertToNativeHelper>::Accessor: NullableArrayAccessor<Native = T> + 'static,
77+
TD8: UpdateDictionaryIndexInto<TD16>,
78+
{
79+
type Native = T;
80+
81+
fn append_value(
82+
&mut self,
83+
value: &<MaybeDictionaryBuilder<TN, TD8, TD16> as ArrayBuilder>::Native,
84+
) {
85+
match self {
86+
Self::Native(array_builder) => array_builder.append_value(value),
87+
Self::Dictionary(dict_array_builder) => match dict_array_builder.append_value(value) {
88+
// we've overflowed the dictionary, so we must convert to the native builder type
89+
Err(DictionaryBuilderError::DictOverflow {}) => {
90+
let mut native = TN::new();
91+
dict_array_builder.to_native(&mut native);
92+
native.append_value(value);
93+
*self = Self::Native(native);
94+
}
95+
_ => {
96+
// do nothing here, as the append was successful
97+
}
98+
},
99+
}
100+
}
101+
102+
fn finish(&mut self) -> ArrayWithType {
103+
match self {
104+
Self::Dictionary(dict_array_builder) => dict_array_builder.finish(),
105+
Self::Native(array_builder) => array_builder.finish(),
106+
}
107+
}
108+
}
109+
110+
#[derive(Default)]
111+
pub struct ArrayOptions {
112+
pub dictionary_options: Option<DictionaryOptions>,
113+
pub nullable: bool,
114+
}
115+
116+
pub struct AdaptiveArrayBuilder<
117+
TN: ArrayBuilder + ArrayBuilderConstructor,
118+
TD8: DictionaryArrayBuilder<UInt8Type> + ArrayBuilderConstructor,
119+
TD16: DictionaryArrayBuilder<UInt16Type> + ArrayBuilderConstructor,
120+
> {
121+
dictionary_options: Option<DictionaryOptions>,
122+
inner: Option<MaybeDictionaryBuilder<TN, TD8, TD16>>,
123+
}
124+
125+
impl<T, TN, TD8, TD16> AdaptiveArrayBuilder<TN, TD8, TD16>
126+
where
127+
TN: ArrayBuilder<Native = T> + ArrayBuilderConstructor,
128+
TD8: DictionaryArrayBuilder<UInt8Type, Native = T>
129+
+ ArrayBuilderConstructor
130+
+ ConvertToNativeHelper,
131+
<TD8 as ConvertToNativeHelper>::Accessor: NullableArrayAccessor<Native = T> + 'static,
132+
TD16: DictionaryArrayBuilder<UInt16Type, Native = T>
133+
+ ArrayBuilderConstructor
134+
+ ConvertToNativeHelper,
135+
<TD16 as ConvertToNativeHelper>::Accessor: NullableArrayAccessor<Native = T> + 'static,
136+
TD8: UpdateDictionaryIndexInto<TD16>,
137+
{
138+
pub fn new(options: ArrayOptions) -> Self {
139+
let inner = if options.nullable {
140+
None
141+
} else {
142+
Some(Self::initial_builder(&options.dictionary_options))
143+
};
144+
145+
Self {
146+
dictionary_options: options.dictionary_options,
147+
inner,
148+
}
149+
}
150+
151+
// Initializes the builder, which may either be a builder for the, if dictionary
152+
// options is `Some`, otherwise it will construct the native builder builder variant
153+
fn initial_builder(
154+
dictionary_options: &Option<DictionaryOptions>,
155+
) -> MaybeDictionaryBuilder<TN, TD8, TD16> {
156+
match dictionary_options.as_ref() {
157+
Some(dictionary_options) => MaybeDictionaryBuilder::Dictionary(
158+
AdaptiveDictionaryBuilder::new(dictionary_options),
159+
),
160+
None => MaybeDictionaryBuilder::Native(TN::new()),
161+
}
162+
}
163+
164+
fn append_value(&mut self, value: &T) {
165+
if self.inner.is_none() {
166+
// TODO -- when we handle nulls here we need to keep track of how many
167+
// nulls have been appended before the first value, and prefix this
168+
// newly initialized array with that number of nulls
169+
// https://github.com/open-telemetry/otel-arrow/issues/534
170+
self.inner = Some(Self::initial_builder(&self.dictionary_options));
171+
}
172+
173+
let inner = self
174+
.inner
175+
.as_mut()
176+
.expect("inner should now be initialized");
177+
inner.append_value(value)
178+
}
179+
180+
fn finish(&mut self) -> Option<ArrayWithType> {
181+
self.inner.as_mut().map(|builder| builder.finish())
182+
}
183+
}
184+
185+
pub type StringArrayBuilder = AdaptiveArrayBuilder<
186+
StringBuilder,
187+
StringDictionaryBuilder<UInt8Type>,
188+
StringDictionaryBuilder<UInt16Type>,
189+
>;
190+
191+
#[cfg(test)]
192+
pub mod test {
193+
use super::*;
194+
195+
use std::sync::Arc;
196+
197+
use arrow::array::{StringArray, UInt8Array, UInt8DictionaryArray};
198+
use arrow::datatypes::DataType;
199+
200+
#[test]
201+
fn test_array_builder() {
202+
let mut builder = StringArrayBuilder::new(ArrayOptions {
203+
nullable: true,
204+
dictionary_options: Some(DictionaryOptions {
205+
min_cardinality: 4,
206+
max_cardinality: 4,
207+
}),
208+
});
209+
210+
// expect that for empty array, we get a None value because the builder is nullable
211+
let result = builder.finish();
212+
assert!(result.is_none());
213+
214+
// expect that when we add values, we get a dictionary
215+
builder.append_value(&"a".to_string());
216+
builder.append_value(&"a".to_string());
217+
builder.append_value(&"b".to_string());
218+
219+
let result = builder.finish().unwrap();
220+
assert_eq!(
221+
result.data_type,
222+
DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8))
223+
);
224+
225+
let mut expected_dict_values = StringBuilder::new();
226+
expected_dict_values.append_value("a");
227+
expected_dict_values.append_value("b");
228+
let expected_dict_keys = UInt8Array::from_iter_values(vec![0, 0, 1]);
229+
let expected =
230+
UInt8DictionaryArray::new(expected_dict_keys, Arc::new(expected_dict_values.finish()));
231+
232+
assert_eq!(
233+
result
234+
.array
235+
.as_any()
236+
.downcast_ref::<UInt8DictionaryArray>()
237+
.unwrap(),
238+
&expected
239+
);
240+
}
241+
242+
#[test]
243+
fn test_array_builder_non_nullable_empty() {
244+
let mut builder = StringArrayBuilder::new(ArrayOptions {
245+
nullable: false,
246+
dictionary_options: Some(DictionaryOptions {
247+
min_cardinality: 4,
248+
max_cardinality: 4,
249+
}),
250+
});
251+
252+
// check that since the type we're building is not nullable, we get an empty array
253+
let result = builder.finish().unwrap();
254+
assert_eq!(
255+
result.data_type,
256+
DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8))
257+
);
258+
assert_eq!(result.array.len(), 0);
259+
}
260+
261+
#[test]
262+
fn test_array_builder_dict_overflow() {
263+
let mut builder = StringArrayBuilder::new(ArrayOptions {
264+
nullable: false,
265+
dictionary_options: Some(DictionaryOptions {
266+
min_cardinality: 4,
267+
max_cardinality: 4,
268+
}),
269+
});
270+
271+
// expect that when we add values, we get a dictionary
272+
builder.append_value(&"a".to_string());
273+
builder.append_value(&"b".to_string());
274+
builder.append_value(&"c".to_string());
275+
builder.append_value(&"d".to_string());
276+
builder.append_value(&"e".to_string());
277+
278+
let result = builder.finish().unwrap();
279+
assert_eq!(result.data_type, DataType::Utf8);
280+
281+
let mut expected_values = StringBuilder::new();
282+
expected_values.append_value("a");
283+
expected_values.append_value("b");
284+
expected_values.append_value("c");
285+
expected_values.append_value("d");
286+
expected_values.append_value("e");
287+
288+
assert_eq!(
289+
result.array.as_any().downcast_ref::<StringArray>().unwrap(),
290+
&expected_values.finish()
291+
)
292+
}
293+
}

0 commit comments

Comments
 (0)