enhancement(clickhouse sink): Add ArrowStream format
#24373
base: master
Changes from 2 commits
42c4fb5
a9a67c2
e7b89c0
9e93926
85d7aa3
4e101ac
4211ce3
a4e634b
1826aa3
1db0f07
e1b22a1
5525afb
834ce89
aa69c1c
@@ -0,0 +1,3 @@
The `clickhouse` sink now supports the `arrow_stream` format option, enabling high-performance binary data transfer using Apache Arrow IPC. This provides significantly better performance and smaller payload sizes compared to JSON-based formats.

authors: benjamin-awd
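For context, a minimal sketch of how an Arrow IPC stream payload is assembled with the `arrow` crate (illustrative only, not code from this PR; the two-column schema and values are hypothetical):

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Int64Array, StringArray};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::ipc::writer::StreamWriter;
use arrow::record_batch::RecordBatch;

// Encode one record batch as an Arrow IPC stream: the schema is framed once,
// followed by each batch. This is the kind of binary payload the
// `arrow_stream` format sends in place of JSON.
fn encode_batch() -> arrow::error::Result<Vec<u8>> {
    // Hypothetical schema; in the sink it would come from configuration
    // or a schema provider.
    let schema = Arc::new(Schema::new(vec![
        Field::new("message", DataType::Utf8, false),
        Field::new("count", DataType::Int64, false),
    ]));

    let batch = RecordBatch::try_new(
        Arc::clone(&schema),
        vec![
            Arc::new(StringArray::from(vec!["hello", "world"])) as ArrayRef,
            Arc::new(Int64Array::from(vec![1_i64, 2])) as ArrayRef,
        ],
    )?;

    let mut writer = StreamWriter::try_new(Vec::new(), &schema)?;
    writer.write(&batch)?;
    writer.finish()?;
    writer.into_inner()
}
```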
@@ -16,6 +16,7 @@ use arrow::{
    ipc::writer::StreamWriter,
    record_batch::RecordBatch,
};
use async_trait::async_trait;
use bytes::{BufMut, Bytes, BytesMut};
use chrono::{DateTime, Utc};
use rust_decimal::Decimal;

@@ -25,6 +26,15 @@ use vector_config::configurable_component;
use vector_core::event::{Event, Value};

/// Provides Arrow schema for encoding.
///
/// Sinks can implement this trait to provide custom schema fetching logic.
#[async_trait]
pub trait SchemaProvider: Send + Sync + std::fmt::Debug {
    /// Get the Arrow schema for encoding events.
    async fn get_schema(&self) -> Result<Arc<Schema>, ArrowEncodingError>;
}

/// Configuration for Arrow IPC stream serialization
#[configurable_component]
#[derive(Clone, Default)]

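For illustration only (not part of the diff), a hypothetical implementor of `SchemaProvider` that serves a fixed, pre-built schema could look like the following; the ClickHouse sink itself would presumably fetch the schema from the server instead:

```rust
/// Hypothetical provider that hands out a schema built ahead of time.
#[derive(Debug)]
struct StaticSchemaProvider {
    schema: Arc<Schema>,
}

#[async_trait]
impl SchemaProvider for StaticSchemaProvider {
    async fn get_schema(&self) -> Result<Arc<Schema>, ArrowEncodingError> {
        // Cloning the Arc is cheap; no I/O is needed for a static schema.
        Ok(Arc::clone(&self.schema))
    }
}
```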
@@ -45,6 +55,10 @@ pub struct ArrowStreamSerializerConfig {
    #[serde(default)]
    #[configurable(metadata(docs::examples = true))]
    pub allow_nullable_fields: bool,

    /// Schema provider for lazy schema loading.
    #[serde(skip)]
    schema_provider: Option<Arc<dyn SchemaProvider>>,
}

impl std::fmt::Debug for ArrowStreamSerializerConfig {

@@ -58,6 +72,10 @@ impl std::fmt::Debug for ArrowStreamSerializerConfig {
            .map(|s| format!("{} fields", s.fields().len())),
        )
        .field("allow_nullable_fields", &self.allow_nullable_fields)
        .field(
            "schema_provider",
            &self.schema_provider.as_ref().map(|_| "<provider>"),
        )
        .finish()
    }
}

@@ -68,6 +86,38 @@ impl ArrowStreamSerializerConfig {
        Self {
            schema: Some(schema),
            allow_nullable_fields: false,
            schema_provider: None,
        }
    }

    /// Create a new ArrowStreamSerializerConfig with a schema provider
    pub fn with_provider(provider: Arc<dyn SchemaProvider>) -> Self {
        Self {
            schema: None,
            schema_provider: Some(provider),
            allow_nullable_fields: false,
        }
    }

    /// Get the schema provider if one was configured
    pub fn provider(&self) -> Option<&Arc<dyn SchemaProvider>> {
        self.schema_provider.as_ref()
    }

    /// Resolve the schema from the provider if present.
    pub async fn resolve(&mut self) -> Result<(), ArrowEncodingError> {
        // If schema already exists, nothing to do
        if self.schema.is_some() {
            return Ok(());
        }

        // Fetch from provider if available
        if let Some(provider) = &self.schema_provider {
            let schema = provider.get_schema().await?;
            self.schema = Some(schema);
            Ok(())
        } else {
            Err(ArrowEncodingError::NoSchemaProvided)
        }
    }

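A hypothetical call sequence, reusing the `StaticSchemaProvider` sketch above, to show when `resolve` runs:

```rust
// Sketch only: resolve the schema lazily before the first encode.
let provider: Arc<dyn SchemaProvider> = Arc::new(StaticSchemaProvider {
    schema: Arc::new(Schema::empty()),
});
let mut config = ArrowStreamSerializerConfig::with_provider(provider);

// Fetches the schema through the provider; a no-op if a schema is already set.
config.resolve().await?;
```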
@@ -154,6 +204,13 @@ pub enum ArrowEncodingError {
    #[snafu(display("Schema must be provided before encoding"))]
    NoSchemaProvided,

    /// Failed to fetch schema from provider
    #[snafu(display("Failed to fetch schema from provider: {}", message))]
    SchemaFetchError {
        /// Error message from the provider
        message: String,
    },

    /// Unsupported Arrow data type for field
    #[snafu(display(
        "Unsupported Arrow data type for field '{}': {:?}",

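A provider implementation would presumably surface fetch failures through the new `SchemaFetchError` variant, along these lines (hypothetical helper, not from the diff):

```rust
// Hypothetical: wrap a lower-level fetch failure into the encoder's error type.
fn to_schema_fetch_error(err: impl std::fmt::Display) -> ArrowEncodingError {
    ArrowEncodingError::SchemaFetchError {
        message: err.to_string(),
    }
}
```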
Looks like `rust_decimal` is already using this version in `lib/codecs/Cargo.toml`. We should update the workspace reference to use `1.37.0` and keep using `workspace` here. Additionally, `lib/codecs/Cargo.toml` should also use the `workspace` version of `rust_decimal`.
Changes made in e7b89c0