Merge pull request #13 from aertslab/sample_id_option

SeppeDeWinter · web-flow · commit b03198ce910f · 2025-06-11T07:41:02.000+02:00
Add option to prefix cell barcode with sample id.
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
@@ -15,6 +15,7 @@ use std::collections::HashMap;
 /// * `cell_type_to_cell_barcodes` - A HashMap mapping cell types to cell barcodes.
 /// * `chromsizes` - A HashMap mapping chromosome names to chromosome sizes.
 /// * `verbose` - Whether to print progress messages.
+/// * `cb_prefix` - Prefix prepended to each cell barcode in the output (empty string for no prefix).
 ///
 /// # Example
 ///
@@ -42,6 +43,7 @@ fn split_fragments_by_cell_barcode(
     cell_type_to_cell_barcodes: HashMap<String, Vec<String>>,
     chromsizes: HashMap<String, u64>,
     verbose: bool,
+    cb_prefix: String
 ) -> PyResult<()> {
     // Invert cell_type_to_cell_barcodes
     let mut cell_barcode_to_cell_type: HashMap<String, Vec<String>> = HashMap::new();
@@ -60,6 +62,7 @@ fn split_fragments_by_cell_barcode(
         chromsizes,
         5,
         verbose,
+        cb_prefix
     );
     Ok(())
 }
diff --git a/rust/src/split_fragments.rs b/rust/src/split_fragments.rs
@@ -3,6 +3,7 @@ use rust_htslib::bgzf::Writer;
 use rust_htslib::tbx::{self, Read as TbxRead};
 use rust_htslib::tpool::ThreadPool;
 use std::collections::HashMap;
+use std::default;
 /// Splits a tabix-index fragment file into multiple files based on cell type.
 use std::io::Write;
 
@@ -67,6 +68,17 @@ fn sanitize_string_for_filename(s: String) -> String {
     s.replace([' ', '/'], "_")
 }
 
+fn read_to_array(read_bytes: &[u8], a: &mut [String; 5]) {
+    let read_as_str = String::from_utf8(read_bytes.to_vec())
+                        .expect("Invalid UTF-8 sequence when parsing line");
+    for (i, s) in read_as_str.split("\t").enumerate() {
+        if i > (a.len() - 1) {
+            panic!("Fragment contains more than 5 columns: {}", read_as_str);
+        }
+        a[i] = s.to_string();
+    }
+}
+
 /// Splits a tabix-index fragment file into multiple files based on cell type.
 ///
 /// # Arguments
@@ -79,6 +91,7 @@ fn sanitize_string_for_filename(s: String) -> String {
 /// * `chromsizes` - A HashMap mapping contig names to contig sizes.
 /// * `number_of_threads` - Number of threads to use for writing.
 /// * `verbose` - Whether to print progress messages.
+/// * `cb_prefix` - Prefix prepended to each cell barcode in the output (empty string for no prefix).
 
 pub fn split_fragments_by_cell_barcode(
     path_to_fragments: &String,
@@ -87,6 +100,7 @@ pub fn split_fragments_by_cell_barcode(
     chromsizes: HashMap<String, u64>,
     number_of_threads: u32,
     verbose: bool,
+    cb_prefix: String
 ) {
     // Initialize reader
     let mut tbx_reader = tbx::Reader::from_path(path_to_fragments)
@@ -148,15 +162,17 @@ pub fn split_fragments_by_cell_barcode(
         let mut not_at_end = tbx_reader
             .read(&mut read)
             .unwrap_or_else(|_| panic!("Could not read from fragments file"));
-        let mut read_as_str = String::from_utf8(read.clone()).unwrap();
-
+        let mut read_a: [String; 5] = Default::default();
+        read_to_array(read.as_slice(), &mut read_a);
         // loop over reads
         while not_at_end {
-            let read_cb = read_as_str.split('\t').nth(3).unwrap().to_string();
-            if let Some(cell_types) = cell_barcode_to_cell_type.get(&read_cb) {
+            let read_cb = read_a[3].as_str();
+            if let Some(cell_types) = cell_barcode_to_cell_type.get(read_cb) {
+                // add sample id prefix to cell barcode
+                read_a[3].insert_str(0, &cb_prefix);
                 for cell_type in cell_types {
                     let writer = cell_type_to_writer.get_mut(cell_type).unwrap();
-                    writer.write_all(&read).unwrap_or_else(|_| {
+                    writer.write_all(&read_a.join("\t").as_bytes()).unwrap_or_else(|_| {
                         panic!(
                             "Could not write contig \"{}\" to \"{}\" fragments file",
                             contig, &writer.path
@@ -172,7 +188,7 @@ pub fn split_fragments_by_cell_barcode(
             }
             read.clear();
             not_at_end = tbx_reader.read(&mut read).unwrap();
-            read_as_str = String::from_utf8(read.clone()).unwrap();
+            read_to_array(read.as_slice(), &mut read_a);
         }
 
         // flush buffers
diff --git a/src/scatac_fragment_tools/cli/commands.py b/src/scatac_fragment_tools/cli/commands.py
@@ -82,6 +82,8 @@ def command_split_fragments_by_cell_type(args):
         Column name for the cell type
     args.cell_barcode_column_name: str
         Column name for the cell barcode
+    args.add_sample_id: bool
+        Flag specifying whether or not to prefix the cell barcode with the sample id.
     """
     # Check arguments before doing anything else.
     import os
@@ -192,4 +194,5 @@ def command_split_fragments_by_cell_type(args):
         n_cpu=args.n_cpu,
         verbose=args.verbose,
         clear_temp_folder=args.clear_temp_folder,
+        add_sample_id=args.add_sample_id
     )
diff --git a/src/scatac_fragment_tools/cli/main.py b/src/scatac_fragment_tools/cli/main.py
@@ -273,6 +273,13 @@ def add_split_fragments_by_cell_type_subparser(
         default="cell_barcode",
         help="Column name for the cell barcode",
     )
+    parser.add_optional_argument(
+        "--add_sample_id",
+        dest="add_sample_id",
+        action="store_true",
+        default=False,
+        help="Prefix sample id to cell barcode in pseudobulk fragment file"
+    )
     return parser.get_parser()
 
 
diff --git a/src/scatac_fragment_tools/library/split/split_fragments_by_cell_type.py b/src/scatac_fragment_tools/library/split/split_fragments_by_cell_type.py
@@ -23,6 +23,7 @@ def split_fragment_files_by_cell_type(
     n_cpu: int = 1,
     verbose: bool = False,
     clear_temp_folder: bool = False,
+    add_sample_id: bool = False
 ):
     """
     Split fragment files by cell type.
@@ -47,6 +48,8 @@ def split_fragment_files_by_cell_type(
         Whether to print progress. The default is False.
     clear_temp_folder : bool, optional
         Whether to clear the temporary folder. The default is False.
+    add_sample_id : bool, optional
+        Whether to prefix each cell barcode with the sample id. The default is False.
     """
     # Check whether the same samples were provided in sample_to_fragment_file and
     # sample_to_cell_type_to_cell_barcodes.
@@ -83,6 +86,7 @@ def split_fragment_files_by_cell_type(
             cell_type_to_cell_barcodes=sample_to_cell_type_to_cell_barcodes[sample],
             chromsizes=chromsizes,
             verbose=verbose,
+            cb_prefix=f"{sample}_" if add_sample_id else ""
         )
         for sample in sample_to_cell_type_to_cell_barcodes
     )
diff --git a/tests/split/test_split.py b/tests/split/test_split.py
@@ -84,7 +84,8 @@ def split_command_test_helper(tmp_path, file_dict):
         cell_type = row[0]
         cell_type_bcs = cell_annotations \
             .filter(pl.col("cell_type") == cell_type) \
-            .select(pl.col("cell_barcode"))
+            .get_column("cell_barcode") \
+            .to_list()
         fragments_cell_type = pl.concat(
             [
                 a_fragments.filter(pl.col("column_4").is_in(cell_type_bcs)),
@@ -101,10 +102,94 @@ def split_command_test_helper(tmp_path, file_dict):
             generated_fragments_cell_type
         )
 
+def run_split_command_w_sample_id(tmp_path, output_folder, file_dict):
+    path_to_a_fragments = os.path.join(TEST_DIRECTORY, file_dict["a.fragments"])
+    path_to_a_fragment_index = os.path.join(TEST_DIRECTORY, file_dict["a.fragment_index"])
+    path_to_b_fragments = os.path.join(TEST_DIRECTORY, file_dict["b.fragments"])
+    path_to_b_fragment_index = os.path.join(TEST_DIRECTORY, file_dict["b.fragment_index"])
+    path_to_sample_to_fragment = os.path.join(TEST_DIRECTORY, file_dict["sample_to_fragment"])
+    path_to_cell_type_annotation = os.path.join(TEST_DIRECTORY, file_dict["cell_type_annotation"])
+    path_to_chrom_sizes = os.path.join(TEST_DIRECTORY, file_dict["chrom_sizes"])
+    os.system(f"cp {path_to_a_fragments} {tmp_path}")
+    os.system(f"cp {path_to_a_fragment_index} {tmp_path}")
+    os.system(f"cp {path_to_b_fragments} {tmp_path}")
+    os.system(f"cp {path_to_b_fragment_index} {tmp_path}")
+    os.system(f"cp {path_to_sample_to_fragment} {tmp_path}")
+    os.system(f"cp {path_to_cell_type_annotation} {tmp_path}")
+    os.system(f"cp {path_to_chrom_sizes} {tmp_path}")
+
+    COMMAND = f"""cd {tmp_path} && \
+    scatac_fragment_tools split \
+        -f {path_to_sample_to_fragment} \
+        -b {path_to_cell_type_annotation} \
+        -c {path_to_chrom_sizes} \
+        -o {output_folder} \
+        -t {tmp_path} \
+        --add_sample_id
+    """
+    return os.system(COMMAND)
+
+def split_command_test_helper_w_sample_id(tmp_path, file_dict):
+    output_folder = os.path.join(tmp_path, "output")
+    os.makedirs(output_folder, exist_ok=True)
+    exit_status = run_split_command_w_sample_id(tmp_path, output_folder, file_dict)
+    assert exit_status == 0
+
+    a_fragments = pl.read_csv(
+        TEST_DIRECTORY.joinpath(file_dict["a.fragments"]),
+        separator = "\t",
+        has_header = False
+    )
+    b_fragments = pl.read_csv(
+        TEST_DIRECTORY.joinpath(file_dict["b.fragments"]),
+        separator = "\t",
+        has_header = False
+    )
+    cell_annotations = pl.read_csv(
+        TEST_DIRECTORY.joinpath(file_dict["cell_type_annotation"]),
+        separator = "\t"
+    )
+
+    for (cell_type, ) in cell_annotations \
+            .select([pl.col("cell_type")]) \
+            .unique() \
+            .iter_rows():
+        cell_type_bcs = cell_annotations \
+            .filter(pl.col("cell_type") == cell_type) \
+            .get_column("cell_barcode") \
+            .to_list()
+        fragments_cell_type = pl.concat(
+            [
+                a_fragments.filter(
+                    pl.col("column_4").is_in(cell_type_bcs)
+                ).with_columns(
+                    (pl.lit("A_") + pl.col("column_4")).alias("column_4")
+                ),
+                b_fragments.filter(pl.col("column_4").is_in(cell_type_bcs)
+                ).with_columns(
+                    (pl.lit("B_") + pl.col("column_4")).alias("column_4")
+                )
+            ]
+        ).sort(by=["column_1", "column_2", "column_3", "column_4"])
+        generated_fragments_cell_type = pl.read_csv(
+            os.path.join(output_folder, f"{cell_type}.fragments.tsv.gz"),
+            separator ="\t",
+            has_header=False
+        ).sort(by=["column_1", "column_2", "column_3", "column_4"])
+        assert_frame_equal(
+            fragments_cell_type,
+            generated_fragments_cell_type
+        )
 
 def test_split_command_bc_single_type(tmp_path):
     split_command_test_helper(tmp_path, FILES_ALL_BARCODES_MAPPING_TO_SINGLE_TYPE)
 
 def test_split_command_barcode_mapping_multiple_types(tmp_path):
     split_command_test_helper(tmp_path, FILES_SOME_BARCODES_MAPPING_TO_MULTIPLE_TYPES)
 
+def test_split_command_w_sample_id_bc_single_type(tmp_path):
+    split_command_test_helper_w_sample_id(tmp_path, FILES_ALL_BARCODES_MAPPING_TO_SINGLE_TYPE)
+
+def test_split_command_w_sample_id_barcode_mapping_multiple_types(tmp_path):
+    split_command_test_helper_w_sample_id(tmp_path, FILES_SOME_BARCODES_MAPPING_TO_MULTIPLE_TYPES)
+