
Commit c5bd35f

Merge pull request #92 from influxdata/crepererum/snappy_compression
feat: support Snappy (de)compression
2 parents df8608c + 3bcc358

File tree

8 files changed: +164 -1 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 /target
 Cargo.lock
 perf.data*
+.vscode/

Cargo.toml

Lines changed: 2 additions & 0 deletions
@@ -30,6 +30,7 @@ parking_lot = "0.12"
 pin-project-lite = "0.2"
 rand = "0.8"
 rustls = { version = "0.20", optional = true }
+snap = { version = "1", optional = true }
 thiserror = "1.0"
 time = "0.3"
 tokio = { version = "1.14", default-features = false, features = ["io-util", "net", "rt", "sync", "time"] }
@@ -55,6 +56,7 @@ default = ["transport-tls"]
 
 compression-gzip = ["flate2"]
 compression-lz4 = ["lz4"]
+compression-snappy = ["snap"]
 fuzzing = []
 transport-socks5 = ["async-socks5"]
 transport-tls = ["rustls", "tokio-rustls"]
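
Like gzip and LZ4, Snappy support is opt-in: the `snap` crate is an optional dependency that only builds when the `compression-snappy` feature is enabled. A hypothetical downstream manifest enabling it might look like this (the version requirement is a placeholder, not taken from this commit):

    [dependencies]
    # "x.y" is a placeholder version requirement, not from this commit.
    rskafka = { version = "x.y", features = ["compression-snappy"] }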

README.md

Lines changed: 2 additions & 0 deletions
@@ -92,6 +92,7 @@ For more advanced production and consumption, see [`crate::client::producer`] an
 
 - **`compression-gzip`:** Support compression and decompression of messages using [gzip].
 - **`compression-lz4`:** Support compression and decompression of messages using [LZ4].
+- **`compression-snappy`:** Support compression and decompression of messages using [Snappy].
 - **`fuzzing`:** Exposes some internal data structures so that they can be used by our fuzzers. This is NOT a stable
   feature / API!
 - **`transport-socks5`:** Allow transport via SOCKS5 proxy.
@@ -257,3 +258,4 @@ e.g. by batching writes to multiple partitions in a single ProduceRequest
 [perf]: https://perf.wiki.kernel.org/index.php/Main_Page
 [Redpanda]: https://vectorized.io/redpanda
 [rustls]: https://github.com/rustls/rustls
+[Snappy]: https://github.com/google/snappy

src/client/partition.rs

Lines changed: 4 additions & 0 deletions
@@ -32,6 +32,8 @@ pub enum Compression {
     Gzip,
     #[cfg(feature = "compression-lz4")]
     Lz4,
+    #[cfg(feature = "compression-snappy")]
+    Snappy,
 }
 
 impl Default for Compression {
@@ -371,6 +373,8 @@ fn build_produce_request(
             Compression::Gzip => RecordBatchCompression::Gzip,
             #[cfg(feature = "compression-lz4")]
             Compression::Lz4 => RecordBatchCompression::Lz4,
+            #[cfg(feature = "compression-snappy")]
+            Compression::Snappy => RecordBatchCompression::Snappy,
         },
         timestamp_type: RecordBatchTimestampType::CreateTime,
         producer_id: -1,
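
With the new variant in place, callers select Snappy per produce call. A hypothetical usage sketch follows; the ClientBuilder, partition_client, and produce calls are assumed from rskafka's README of this era and may not match this revision exactly:

    use rskafka::client::{partition::Compression, ClientBuilder};
    use rskafka::record::Record;

    // Hypothetical sketch: the client API here is assumed from the README,
    // not from this diff.
    async fn produce_snappy(records: Vec<Record>) -> Result<(), Box<dyn std::error::Error>> {
        let client = ClientBuilder::new(vec!["localhost:9093".to_owned()])
            .build()
            .await?;
        let partition_client = client.partition_client("my_topic".to_owned(), 0)?;

        // The compression codec is chosen per produce request.
        partition_client
            .produce(records, Compression::Snappy)
            .await?;
        Ok(())
    }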

src/protocol/record.rs

Lines changed: 126 additions & 0 deletions
@@ -24,6 +24,8 @@ use std::io::{Cursor, Read, Write};
 #[cfg(test)]
 use proptest::prelude::*;
 
+use crate::protocol::vec_builder::DEFAULT_BLOCK_SIZE;
+
 use super::{
     primitives::{Int16, Int32, Int64, Int8, Varint, Varlong},
     traits::{ReadError, ReadType, WriteError, WriteType},
@@ -571,6 +573,7 @@ where
             #[cfg(feature = "compression-gzip")]
             RecordBatchCompression::Gzip => {
                 use flate2::read::GzDecoder;
+
                 let mut decoder = GzDecoder::new(reader);
                 let records = Self::read_records(&mut decoder, is_control, n_records)?;
 
@@ -581,6 +584,7 @@ where
             #[cfg(feature = "compression-lz4")]
             RecordBatchCompression::Lz4 => {
                 use lz4::Decoder;
+
                 let mut decoder = Decoder::new(reader)?;
                 let records = Self::read_records(&mut decoder, is_control, n_records)?;
 
@@ -592,6 +596,61 @@ where
 
                 records
             }
+            #[cfg(feature = "compression-snappy")]
+            RecordBatchCompression::Snappy => {
+                use snap::raw::{decompress_len, Decoder};
+
+                // Construct the input for the raw decoder.
+                let mut input = vec![];
+                reader.read_to_end(&mut input)?;
+
+                // The snappy compression used here is unframed aka "raw". So we first need to figure out the
+                // uncompressed length. See
+                //
+                // - https://github.com/edenhill/librdkafka/blob/2b76b65212e5efda213961d5f84e565038036270/src/rdkafka_msgset_reader.c#L345-L348
+                // - https://github.com/edenhill/librdkafka/blob/747f77c98fbddf7dc6508f76398e0fc9ee91450f/src/snappy.c#L779
+                let uncompressed_size = decompress_len(&input).unwrap();
+
+                // Decode snappy payload.
+                // The uncompressed length is unchecked and can be up to 2^32-1 bytes. To avoid a DDoS vector we try to
+                // limit it to a small size and if that fails we double that size;
+                let mut max_uncompressed_size = DEFAULT_BLOCK_SIZE;
+                let output = loop {
+                    let try_uncompressed_size = uncompressed_size.min(max_uncompressed_size);
+
+                    let mut decoder = Decoder::new();
+                    let mut output = vec![0; try_uncompressed_size];
+                    let actual_uncompressed_size = match decoder.decompress(&input, &mut output) {
+                        Ok(size) => size,
+                        Err(snap::Error::BufferTooSmall { .. })
+                            if max_uncompressed_size < uncompressed_size =>
+                        {
+                            // try larger buffer
+                            max_uncompressed_size *= 2;
+                            continue;
+                        }
+                        Err(e) => {
+                            return Err(ReadError::Malformed(Box::new(e)));
+                        }
+                    };
+                    if actual_uncompressed_size != uncompressed_size {
+                        return Err(ReadError::Malformed(
+                            "broken snappy data".to_string().into(),
+                        ));
+                    }
+
+                    break output;
+                };
+
+                // Read uncompressed records.
+                let mut decoder = Cursor::new(output);
+                let records = Self::read_records(&mut decoder, is_control, n_records)?;
+
+                // Check that there's no data left within the uncompressed block.
+                ensure_eof(&mut decoder, "Data left in Snappy block")?;
+
+                records
+            }
             #[allow(unreachable_patterns)]
             _ => {
                 return Err(ReadError::Malformed(
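
Because Kafka's Snappy payloads are raw rather than framed, the claimed uncompressed size comes from a length header an attacker fully controls, so the decoder caps its first allocation and only grows it geometrically when the payload proves genuinely larger. A minimal standalone sketch of that strategy, assuming the snap crate's raw API (the cap mirrors DEFAULT_BLOCK_SIZE):

    use snap::raw::{decompress_len, Decoder};

    /// Initial allocation cap; mirrors DEFAULT_BLOCK_SIZE (10 MB).
    const CAP: usize = 1024 * 1024 * 10;

    /// Sketch: decompress raw Snappy without trusting the claimed size.
    fn decode_raw_snappy(input: &[u8]) -> Result<Vec<u8>, snap::Error> {
        let claimed = decompress_len(input)?;
        let mut cap = CAP;
        loop {
            // Allocate at most `cap` bytes, even if the header claims more.
            let mut output = vec![0; claimed.min(cap)];
            match Decoder::new().decompress(input, &mut output) {
                Ok(n) => {
                    output.truncate(n);
                    return Ok(output);
                }
                // The payload really is larger than the cap: double and retry.
                Err(snap::Error::BufferTooSmall { .. }) if cap < claimed => {
                    cap *= 2;
                    continue;
                }
                Err(e) => return Err(e),
            }
        }
    }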
@@ -728,13 +787,15 @@ where
             #[cfg(feature = "compression-gzip")]
             RecordBatchCompression::Gzip => {
                 use flate2::{write::GzEncoder, Compression};
+
                 let mut encoder = GzEncoder::new(writer, Compression::default());
                 Self::write_records(&mut encoder, self.records)?;
                 encoder.finish()?;
             }
             #[cfg(feature = "compression-lz4")]
             RecordBatchCompression::Lz4 => {
                 use lz4::{liblz4::BlockMode, EncoderBuilder};
+
                 let mut encoder = EncoderBuilder::new()
                     .block_mode(
                         // the only one supported by Kafka
@@ -745,6 +806,21 @@ where
                 let (_writer, res) = encoder.finish();
                 res?;
             }
+            #[cfg(feature = "compression-snappy")]
+            RecordBatchCompression::Snappy => {
+                use snap::raw::{max_compress_len, Encoder};
+
+                let mut input = vec![];
+                Self::write_records(&mut input, self.records)?;
+
+                let mut encoder = Encoder::new();
+                let mut output = vec![0; max_compress_len(input.len())];
+                let len = encoder
+                    .compress(&input, &mut output)
+                    .map_err(|e| WriteError::Malformed(Box::new(e)))?;
+
+                writer.write_all(&output[..len])?;
+            }
             #[allow(unreachable_patterns)]
             _ => {
                 return Err(WriteError::Malformed(
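
The write path is simpler because raw Snappy can bound its worst-case output size up front, so the whole batch is compressed in a single call and only the used prefix is written out. A minimal sketch of that pattern, again assuming the snap crate's raw API:

    use snap::raw::{max_compress_len, Encoder};

    /// Sketch: compress a buffer as raw (unframed) Snappy.
    fn encode_raw_snappy(input: &[u8]) -> Result<Vec<u8>, snap::Error> {
        // max_compress_len is a worst-case bound, so one buffer always suffices.
        let mut output = vec![0; max_compress_len(input.len())];
        let len = Encoder::new().compress(input, &mut output)?;
        output.truncate(len);
        Ok(output)
    }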
@@ -965,4 +1041,54 @@ mod tests {
         let actual2 = RecordBatch::read(&mut Cursor::new(data2)).unwrap();
         assert_eq!(actual2, expected);
     }
+
+    #[cfg(feature = "compression-snappy")]
+    #[test]
+    fn test_decode_fixture_snappy() {
+        // This data was obtained by watching rdkafka.
+        let data = [
+            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x58\x00\x00\x00\x00".to_vec(),
+            b"\x02\xad\x86\xf4\xf4\x00\x02\x00\x00\x00\x00\x00\x00\x01\x7e\xb6".to_vec(),
+            b"\x45\x0e\x52\x00\x00\x01\x7e\xb6\x45\x0e\x52\xff\xff\xff\xff\xff".to_vec(),
+            b"\xff\xff\xff\xff\xff\xff\xff\xff\xff\x00\x00\x00\x01\x80\x01\x1c".to_vec(),
+            b"\xfc\x01\x00\x00\x00\xc8\x01\x78\xfe\x01\x00\x8a\x01\x00\x50\x16".to_vec(),
+            b"\x68\x65\x6c\x6c\x6f\x20\x6b\x61\x66\x6b\x61\x02\x06\x66\x6f\x6f".to_vec(),
+            b"\x06\x62\x61\x72".to_vec(),
+        ]
+        .concat();
+
+        let actual = RecordBatch::read(&mut Cursor::new(data)).unwrap();
+        let expected = RecordBatch {
+            base_offset: 0,
+            partition_leader_epoch: 0,
+            last_offset_delta: 0,
+            first_timestamp: 1643735486034,
+            max_timestamp: 1643735486034,
+            producer_id: -1,
+            producer_epoch: -1,
+            base_sequence: -1,
+            records: ControlBatchOrRecords::Records(vec![Record {
+                timestamp_delta: 0,
+                offset_delta: 0,
+                key: vec![b'x'; 100],
+                value: b"hello kafka".to_vec(),
+                headers: vec![RecordHeader {
+                    key: "foo".to_owned(),
+                    value: b"bar".to_vec(),
+                }],
+            }]),
+            compression: RecordBatchCompression::Snappy,
+            is_transactional: false,
+            timestamp_type: RecordBatchTimestampType::CreateTime,
+        };
+        assert_eq!(actual, expected);
+
+        let mut data2 = vec![];
+        actual.write(&mut data2).unwrap();
+
+        // don't compare if the data is equal because compression encoder might work slightly differently, use another
+        // roundtrip instead
+        let actual2 = RecordBatch::read(&mut Cursor::new(data2)).unwrap();
+        assert_eq!(actual2, expected);
+    }
 }

src/protocol/vec_builder.rs

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 //! Helper to build a vector w/o blowing up memory.
 
 /// Default block size (10MB).
-const DEFAULT_BLOCK_SIZE: usize = 1024 * 1024 * 10;
+pub const DEFAULT_BLOCK_SIZE: usize = 1024 * 1024 * 10;
 
 /// Helper to build a vector w/ limited memory consumption.
 ///
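
The constant becomes pub so that the Snappy decode path in src/protocol/record.rs can reuse it as its initial allocation cap, as shown in the sketch above.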

tests/produce_consume.rs

Lines changed: 24 additions & 0 deletions
@@ -81,6 +81,30 @@ async fn test_produce_rskafka_consume_rskafka_lz4() {
     assert_produce_consume(produce_rskafka, consume_rskafka, Compression::Lz4).await;
 }
 
+#[cfg(feature = "compression-snappy")]
+#[tokio::test]
+async fn test_produce_rdkafka_consume_rdkafka_snappy() {
+    assert_produce_consume(produce_rdkafka, consume_rdkafka, Compression::Snappy).await;
+}
+
+#[cfg(feature = "compression-snappy")]
+#[tokio::test]
+async fn test_produce_rskafka_consume_rdkafka_snappy() {
+    assert_produce_consume(produce_rskafka, consume_rdkafka, Compression::Snappy).await;
+}
+
+#[cfg(feature = "compression-snappy")]
+#[tokio::test]
+async fn test_produce_rdkafka_consume_rskafka_snappy() {
+    assert_produce_consume(produce_rdkafka, consume_rskafka, Compression::Snappy).await;
+}
+
+#[cfg(feature = "compression-snappy")]
+#[tokio::test]
+async fn test_produce_rskafka_consume_rskafka_snappy() {
+    assert_produce_consume(produce_rskafka, consume_rskafka, Compression::Snappy).await;
+}
+
 async fn assert_produce_consume<F1, G1, F2, G2>(
     f_produce: F1,
     f_consume: F2,

tests/rdkafka_helper.rs

Lines changed: 4 additions & 0 deletions
@@ -33,6 +33,10 @@ pub async fn produce(
         Compression::Lz4 => {
             cfg.set("compression.codec", "lz4");
         }
+        #[cfg(feature = "compression-snappy")]
+        Compression::Snappy => {
+            cfg.set("compression.codec", "snappy");
+        }
     }
     let client: FutureProducer<_> = cfg.create().unwrap();
 