21 changes: 21 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

6 changes: 6 additions & 0 deletions Cargo.toml
@@ -22,8 +22,14 @@ rand = "0.8.5"
walkdir = "2.3.2"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
# Performance opts
jemallocator = "0.5.0"


[dev-dependencies]
assert_cmd = "2.0.4"
predicates = "2.1.1"
tempfile = "3.3.0"

[profile.release]
lto = true
111 changes: 110 additions & 1 deletion src/commands/merge.rs
@@ -3,6 +3,12 @@ use crate::errors::PQRSError::{FileExists, FileNotFound};
use crate::utils::{check_path_present, get_row_batches, open_file, write_parquet};
use clap::Parser;
use log::debug;
use parquet::basic::{BrotliLevel, Compression, Encoding, GzipLevel, ZstdLevel};
use parquet::file::properties::{WriterProperties, WriterVersion};
use parquet::schema::types::ColumnPath;
use serde::Deserialize;
use std::collections::HashMap;
use std::fs;
use std::ops::Add;
use std::path::PathBuf;

@@ -16,12 +22,115 @@ pub struct MergeCommandArgs {
/// Parquet file to write
#[arg(short, long)]
output: PathBuf,

/// Path to a json config file specifying WriterProperties::builder() properties.
#[arg(short, long)]
config: Option<PathBuf>,
}

#[derive(Debug, Clone, Deserialize)]
pub struct MergeConfig {
pub set_dictionary_enabled: Option<bool>,
/// The encodings for this are just the text values of the enum parquet::basic::Encoding
pub column_encodings: Option<HashMap<String, String>>,
pub column_dictionary_enabled: Option<HashMap<String, bool>>,
pub compression: Option<String>,
pub compression_level: Option<u32>,
}
Comment on lines +31 to +39

Owner: why do we want to use a config file instead of command line options for these?

Author: When using CLI opts, your command blows up to be huge when you have a large number of columns. Using a JSON file is super convenient and easy to read :)


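For reference, a hypothetical example of what such a config file could look like, matching the MergeConfig fields above. The column names are made up; the encoding strings are the parquet::basic::Encoding variant names accepted below, and compression accepts "brotli", "gzip", or "zstd":

```json
{
  "set_dictionary_enabled": true,
  "column_encodings": {
    "user_id": "DELTA_BINARY_PACKED",
    "payload": "PLAIN"
  },
  "column_dictionary_enabled": {
    "payload": false
  },
  "compression": "zstd",
  "compression_level": 3
}
```

A file like this would be passed via the new --config flag and parsed by build_props_from_json_config below.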
fn build_encoding_mappings() -> HashMap<&'static str, Encoding> {
HashMap::from([
("PLAIN", Encoding::PLAIN),
("PLAIN_DICTIONARY", Encoding::PLAIN_DICTIONARY),
("RLE", Encoding::RLE),
("BIT_PACKED", Encoding::BIT_PACKED),
("DELTA_BINARY_PACKED", Encoding::DELTA_BINARY_PACKED),
("DELTA_LENGTH_BYTE_ARRAY", Encoding::DELTA_LENGTH_BYTE_ARRAY),
("DELTA_BYTE_ARRAY", Encoding::DELTA_BYTE_ARRAY),
("RLE_DICTIONARY", Encoding::RLE_DICTIONARY),
("BYTE_STREAM_SPLIT", Encoding::BYTE_STREAM_SPLIT),
])
}

fn build_props_from_json_config(
config_path: PathBuf,
) -> Result<WriterProperties, PQRSError> {
let data = fs::read_to_string(config_path)?;
let merge_config: MergeConfig = serde_json::from_str(&data)?;
let mut props =
WriterProperties::builder().set_writer_version(WriterVersion::PARQUET_2_0);

if let Some(de) = merge_config.set_dictionary_enabled {
props = props.set_dictionary_enabled(de);
}

if let Some(column_encodings) = merge_config.column_encodings {
let encoding_mappings = build_encoding_mappings();
for (column_name, encoding_str) in column_encodings {
if !encoding_mappings.contains_key(encoding_str.as_str()) {
return Err(PQRSError::IllegalEncodingType());
}

let encoding = *encoding_mappings
.get(encoding_str.clone().as_str())
.unwrap();
props = props.set_column_encoding(ColumnPath::from(column_name), encoding)
Comment on lines +70 to +77

Owner: can this be made idiomatic using pattern matching?

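For illustration, a minimal sketch of the match-based lookup being suggested, reusing the encoding names and the IllegalEncodingType error from this diff (a hypothetical helper, not the PR's actual code):

```rust
// Hypothetical helper: map the config string straight to an Encoding with `match`,
// replacing the HashMap lookup plus unwrap(). Assumes the same imports as
// src/commands/merge.rs (parquet::basic::Encoding, crate::errors::PQRSError).
fn parse_encoding(s: &str) -> Result<Encoding, PQRSError> {
    Ok(match s {
        "PLAIN" => Encoding::PLAIN,
        "PLAIN_DICTIONARY" => Encoding::PLAIN_DICTIONARY,
        "RLE" => Encoding::RLE,
        "BIT_PACKED" => Encoding::BIT_PACKED,
        "DELTA_BINARY_PACKED" => Encoding::DELTA_BINARY_PACKED,
        "DELTA_LENGTH_BYTE_ARRAY" => Encoding::DELTA_LENGTH_BYTE_ARRAY,
        "DELTA_BYTE_ARRAY" => Encoding::DELTA_BYTE_ARRAY,
        "RLE_DICTIONARY" => Encoding::RLE_DICTIONARY,
        "BYTE_STREAM_SPLIT" => Encoding::BYTE_STREAM_SPLIT,
        _ => return Err(PQRSError::IllegalEncodingType()),
    })
}

// The loop body above would then collapse to something like:
// props = props.set_column_encoding(ColumnPath::from(column_name), parse_encoding(&encoding_str)?);
```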
}
}

if let Some(column_de) = merge_config.column_dictionary_enabled {
for (column_name, de) in column_de {
println!("{column_name}");
Owner: we probably don't want println!()s in the release

Author: Oops! Sorry.

props =
props.set_column_dictionary_enabled(ColumnPath::from(column_name), de);
}
}

if let Some(compression_algo) = merge_config.compression {
if compression_algo.to_lowercase() == "brotli" {
Owner: same for compression_algo, I think we can use pattern matching here?

Author: Good callout!

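For illustration, a minimal sketch of the match-based version being suggested, reusing the types and error messages from this diff (not the PR's actual code; it assumes compression_level is always provided whenever compression is set):

```rust
// Hypothetical rewrite of the if/else chain below using `match`.
// Assumes the same imports as src/commands/merge.rs
// (Compression, BrotliLevel, GzipLevel, ZstdLevel).
let level = merge_config
    .compression_level
    .expect("Compression level was not set!");
props = match compression_algo.to_lowercase().as_str() {
    "brotli" => props.set_compression(Compression::BROTLI(
        BrotliLevel::try_new(level).expect("Invalid Brotli level!"),
    )),
    "gzip" => props.set_compression(Compression::GZIP(
        GzipLevel::try_new(level).expect("Invalid GZIP level!"),
    )),
    "zstd" => props.set_compression(Compression::ZSTD(
        ZstdLevel::try_new(level as i32).expect("Invalid ZSTD level!"),
    )),
    // Unknown algorithm: leave the builder unchanged, as the current code does.
    _ => props,
};
```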
props = props.set_compression(Compression::BROTLI(
BrotliLevel::try_new(
merge_config
.compression_level
.expect("Compression level was not set!"),
)
.expect("Invalid Brotli level!"),
))
} else if compression_algo.to_lowercase() == "gzip" {
props = props.set_compression(Compression::GZIP(
GzipLevel::try_new(
merge_config
.compression_level
.expect("Compression level was not set!"),
)
.expect("Invalid GZIP level!"),
))
} else if compression_algo.to_lowercase() == "zstd" {
props = props.set_compression(Compression::ZSTD(
ZstdLevel::try_new(
merge_config
.compression_level
.expect("Compression level was not set!")
as i32,
)
.expect("Invalid ZSTD level!"),
))
}
}

Ok(props.build())
}

pub(crate) fn execute(opts: MergeCommandArgs) -> Result<(), PQRSError> {
debug!("The file names to read are: {:?}", opts.input);
debug!("The file name to write to: {}", opts.output.display());

let merge_config = if opts.config.is_some() {
Some(build_props_from_json_config(opts.config.unwrap())?)
} else {
None
};

// make sure output does not exist already before any reads
if check_path_present(&opts.output) {
return Err(FileExists(opts.output));
@@ -43,7 +152,7 @@ pub(crate) fn execute(opts: MergeCommandArgs) -> Result<(), PQRSError> {
}
// debug!("The combined data looks like this: {:#?}", combined);
// debug!("This is the input schema: {:#?}", combined.schema);
write_parquet(combined, &opts.output)?;
write_parquet(combined, &opts.output, merge_config)?;

Ok(())
}
2 changes: 2 additions & 0 deletions src/errors.rs
@@ -33,4 +33,6 @@ pub enum PQRSError {
UTF8ConvertError(#[from] FromUtf8Error),
#[error("Could not read/write to buffer")]
BufferWriteError(#[from] IntoInnerError<BufWriter<Vec<u8>>>),
#[error("Illegal encoding type")]
IllegalEncodingType(),
}
5 changes: 5 additions & 0 deletions src/main.rs
@@ -6,6 +6,11 @@ mod commands
mod errors;
mod utils;

// use jemalloc for release builds
extern crate jemallocator;
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

Comment on lines +9 to +13

Owner: i'm unfamiliar with jemallocator. can you give a tl;dr on these performance optimizations? what do they do?

Author: Is there a way to only use my first commit in the PR? The second commit is performance-optimized stuff that only belongs on my fork.

TL;DR: this can be removed for the main branch.

#[derive(Subcommand, Debug)]
enum Commands {
Cat(commands::cat::CatCommandArgs),
5 changes: 4 additions & 1 deletion src/utils.rs
@@ -3,6 +3,7 @@ use crate::errors::PQRSError::CouldNotOpenFile;
use arrow::{datatypes::Schema, record_batch::RecordBatch};
use log::debug;
use parquet::arrow::{arrow_reader::ArrowReaderBuilder, ArrowWriter};
use parquet::file::properties::WriterProperties;
use parquet::file::reader::{FileReader, SerializedFileReader};
use parquet::record::Row;
use rand::seq::SliceRandom;
@@ -259,14 +260,16 @@ pub fn get_row_batches(file: File) -> Result<ParquetData, PQRSError> {
pub fn write_parquet<P: AsRef<Path>>(
data: ParquetData,
output: P,
props: Option<WriterProperties>,
) -> Result<(), PQRSError> {
let file = File::create(output)?;
let fields = data.schema.fields().to_vec();
// the schema from the record batch might not contain the file specific metadata
// drop the schema to make sure that we don't fail in that case
let schema_without_metadata = Schema::new(fields);

let mut writer = ArrowWriter::try_new(file, Arc::new(schema_without_metadata), None)?;
let mut writer =
ArrowWriter::try_new(file, Arc::new(schema_without_metadata), props)?;

// write record batches one at a time
// record batches are not combined