Skip to content

Commit b03198c

Browse files
Merge pull request #13 from aertslab/sample_id_option
Add option to prefix cell barcode with sample id.
2 parents 26396f1 + 11c9e8f commit b03198c

File tree

6 files changed

+125
-7
lines changed

6 files changed

+125
-7
lines changed

rust/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use std::collections::HashMap;
1515
/// * `cell_type_to_cell_barcodes` - A HashMap mapping cell types to cell barcodes.
1616
/// * `chromsizes` - A HashMap mapping chromosome names to chromosome sizes.
1717
/// * `verbose` - Whether to print progress messages.
18+
/// * `cb_prefix` - Prefix added to each cell barcode
1819
///
1920
/// # Example
2021
///
@@ -42,6 +43,7 @@ fn split_fragments_by_cell_barcode(
4243
cell_type_to_cell_barcodes: HashMap<String, Vec<String>>,
4344
chromsizes: HashMap<String, u64>,
4445
verbose: bool,
46+
cb_prefix: String
4547
) -> PyResult<()> {
4648
// Invert cell_type_to_cell_barcodes
4749
let mut cell_barcode_to_cell_type: HashMap<String, Vec<String>> = HashMap::new();
@@ -60,6 +62,7 @@ fn split_fragments_by_cell_barcode(
6062
chromsizes,
6163
5,
6264
verbose,
65+
cb_prefix
6366
);
6467
Ok(())
6568
}

rust/src/split_fragments.rs

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ use rust_htslib::bgzf::Writer;
33
use rust_htslib::tbx::{self, Read as TbxRead};
44
use rust_htslib::tpool::ThreadPool;
55
use std::collections::HashMap;
6+
use std::default;
67
/// Splits a tabix-index fragment file into multiple files based on cell type.
78
use std::io::Write;
89

@@ -67,6 +68,17 @@ fn sanitize_string_for_filename(s: String) -> String {
6768
s.replace([' ', '/'], "_")
6869
}
6970

71+
fn read_to_array(read_bytes: &[u8], a: &mut [String; 5]) {
72+
let read_as_str = String::from_utf8(read_bytes.to_vec())
73+
.expect("Invalid UTF-8 sequence when parsing line");
74+
for (i, s) in read_as_str.split("\t").enumerate() {
75+
if i > (a.len() - 1) {
76+
panic!("Fragment contains more than 5 columns: {}", read_as_str);
77+
}
78+
a[i] = s.to_string();
79+
}
80+
}
81+
7082
/// Splits a tabix-index fragment file into multiple files based on cell type.
7183
///
7284
/// # Arguments
@@ -79,6 +91,7 @@ fn sanitize_string_for_filename(s: String) -> String {
7991
/// * `chromsizes` - A HashMap mapping contig names to contig sizes.
8092
/// * `number_of_threads` - Number of threads to use for writing.
8193
/// * `verbose` - Whether to print progress messages.
94+
/// * `cb_prefix` - Prefix added to each cell barcode
8295
8396
pub fn split_fragments_by_cell_barcode(
8497
path_to_fragments: &String,
@@ -87,6 +100,7 @@ pub fn split_fragments_by_cell_barcode(
87100
chromsizes: HashMap<String, u64>,
88101
number_of_threads: u32,
89102
verbose: bool,
103+
cb_prefix: String
90104
) {
91105
// Initialize reader
92106
let mut tbx_reader = tbx::Reader::from_path(path_to_fragments)
@@ -148,15 +162,17 @@ pub fn split_fragments_by_cell_barcode(
148162
let mut not_at_end = tbx_reader
149163
.read(&mut read)
150164
.unwrap_or_else(|_| panic!("Could not read from fragments file"));
151-
let mut read_as_str = String::from_utf8(read.clone()).unwrap();
152-
165+
let mut read_a: [String; 5] = Default::default();
166+
read_to_array(read.as_slice(), &mut read_a);
153167
// loop over reads
154168
while not_at_end {
155-
let read_cb = read_as_str.split('\t').nth(3).unwrap().to_string();
156-
if let Some(cell_types) = cell_barcode_to_cell_type.get(&read_cb) {
169+
let read_cb = read_a[3].as_str();
170+
if let Some(cell_types) = cell_barcode_to_cell_type.get(read_cb) {
171+
// add sample id prefix to cell barcode
172+
read_a[3].insert_str(0, &cb_prefix);
157173
for cell_type in cell_types {
158174
let writer = cell_type_to_writer.get_mut(cell_type).unwrap();
159-
writer.write_all(&read).unwrap_or_else(|_| {
175+
writer.write_all(&read_a.join("\t").as_bytes()).unwrap_or_else(|_| {
160176
panic!(
161177
"Could not write contig \"{}\" to \"{}\" fragments file",
162178
contig, &writer.path
@@ -172,7 +188,7 @@ pub fn split_fragments_by_cell_barcode(
172188
}
173189
read.clear();
174190
not_at_end = tbx_reader.read(&mut read).unwrap();
175-
read_as_str = String::from_utf8(read.clone()).unwrap();
191+
read_to_array(read.as_slice(), &mut read_a);
176192
}
177193

178194
// flush buffers

src/scatac_fragment_tools/cli/commands.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ def command_split_fragments_by_cell_type(args):
8282
Column name for the cell type
8383
args.cell_barcode_column_name: str
8484
Column name for the cell barcode
85+
args.add_sample_id: bool
86+
Flag specifying wether or not to prefix the cell barcode with the sample id.
8587
"""
8688
# Check arguments before doing anything else.
8789
import os
@@ -192,4 +194,5 @@ def command_split_fragments_by_cell_type(args):
192194
n_cpu=args.n_cpu,
193195
verbose=args.verbose,
194196
clear_temp_folder=args.clear_temp_folder,
197+
add_sample_id=args.add_sample_id
195198
)

src/scatac_fragment_tools/cli/main.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,13 @@ def add_split_fragments_by_cell_type_subparser(
273273
default="cell_barcode",
274274
help="Column name for the cell barcode",
275275
)
276+
parser.add_optional_argument(
277+
"--add_sample_id",
278+
dest="add_sample_id",
279+
action="store_true",
280+
default=False,
281+
help="Prefix sample id to cell barcode in pseudobulk fragment file"
282+
)
276283
return parser.get_parser()
277284

278285

src/scatac_fragment_tools/library/split/split_fragments_by_cell_type.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ def split_fragment_files_by_cell_type(
2323
n_cpu: int = 1,
2424
verbose: bool = False,
2525
clear_temp_folder: bool = False,
26+
add_sample_id: bool = False
2627
):
2728
"""
2829
Split fragment files by cell type.
@@ -47,6 +48,8 @@ def split_fragment_files_by_cell_type(
4748
Whether to print progress. The default is False.
4849
clear_temp_folder : bool, optional
4950
Whether to clear the temporary folder. The default is False.
51+
add_sample_id: bool, optional
52+
Whether or not to prefix the cell barcode with the sample id.
5053
"""
5154
# Check whether the same samples were provided in sample_to_fragment_file and
5255
# sample_to_cell_type_to_cell_barcodes.
@@ -83,6 +86,7 @@ def split_fragment_files_by_cell_type(
8386
cell_type_to_cell_barcodes=sample_to_cell_type_to_cell_barcodes[sample],
8487
chromsizes=chromsizes,
8588
verbose=verbose,
89+
cb_prefix=f"{sample}_" if add_sample_id else ""
8690
)
8791
for sample in sample_to_cell_type_to_cell_barcodes
8892
)

tests/split/test_split.py

Lines changed: 86 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,8 @@ def split_command_test_helper(tmp_path, file_dict):
8484
cell_type = row[0]
8585
cell_type_bcs = cell_annotations \
8686
.filter(pl.col("cell_type") == cell_type) \
87-
.select(pl.col("cell_barcode"))
87+
.get_column("cell_barcode") \
88+
.to_list()
8889
fragments_cell_type = pl.concat(
8990
[
9091
a_fragments.filter(pl.col("column_4").is_in(cell_type_bcs)),
@@ -101,10 +102,94 @@ def split_command_test_helper(tmp_path, file_dict):
101102
generated_fragments_cell_type
102103
)
103104

105+
def run_split_command_w_sample_id(tmp_path, output_folder, file_dict):
106+
path_to_a_fragments = os.path.join(TEST_DIRECTORY, file_dict["a.fragments"])
107+
path_to_a_fragment_index = os.path.join(TEST_DIRECTORY, file_dict["a.fragment_index"])
108+
path_to_b_fragments = os.path.join(TEST_DIRECTORY, file_dict["b.fragments"])
109+
path_to_b_fragment_index = os.path.join(TEST_DIRECTORY, file_dict["b.fragment_index"])
110+
path_to_sample_to_fragment = os.path.join(TEST_DIRECTORY, file_dict["sample_to_fragment"])
111+
path_to_cell_type_annotation = os.path.join(TEST_DIRECTORY, file_dict["cell_type_annotation"])
112+
path_to_chrom_sizes = os.path.join(TEST_DIRECTORY, file_dict["chrom_sizes"])
113+
os.system(f"cp {path_to_a_fragments} {tmp_path}")
114+
os.system(f"cp {path_to_a_fragment_index} {tmp_path}")
115+
os.system(f"cp {path_to_b_fragments} {tmp_path}")
116+
os.system(f"cp {path_to_b_fragment_index} {tmp_path}")
117+
os.system(f"cp {path_to_sample_to_fragment} {tmp_path}")
118+
os.system(f"cp {path_to_cell_type_annotation} {tmp_path}")
119+
os.system(f"cp {path_to_chrom_sizes} {tmp_path}")
120+
121+
COMMAND = f"""cd {tmp_path} && \
122+
scatac_fragment_tools split \
123+
-f {path_to_sample_to_fragment} \
124+
-b {path_to_cell_type_annotation} \
125+
-c {path_to_chrom_sizes} \
126+
-o {output_folder} \
127+
-t {tmp_path} \
128+
--add_sample_id
129+
"""
130+
return os.system(COMMAND)
131+
132+
def split_command_test_helper_w_sample_id(tmp_path, file_dict):
133+
output_folder = os.path.join(tmp_path, "output")
134+
os.makedirs(output_folder, exist_ok=True)
135+
exit_status = run_split_command_w_sample_id(tmp_path, output_folder, file_dict)
136+
assert exit_status == 0
137+
138+
a_fragments = pl.read_csv(
139+
TEST_DIRECTORY.joinpath(file_dict["a.fragments"]),
140+
separator = "\t",
141+
has_header = False
142+
)
143+
b_fragments = pl.read_csv(
144+
TEST_DIRECTORY.joinpath(file_dict["b.fragments"]),
145+
separator = "\t",
146+
has_header = False
147+
)
148+
cell_annotations = pl.read_csv(
149+
TEST_DIRECTORY.joinpath(file_dict["cell_type_annotation"]),
150+
separator = "\t"
151+
)
152+
153+
for (cell_type, ) in cell_annotations \
154+
.select([pl.col("cell_type")]) \
155+
.unique() \
156+
.iter_rows():
157+
cell_type_bcs = cell_annotations \
158+
.filter(pl.col("cell_type") == cell_type) \
159+
.get_column("cell_barcode") \
160+
.to_list()
161+
fragments_cell_type = pl.concat(
162+
[
163+
a_fragments.filter(
164+
pl.col("column_4").is_in(cell_type_bcs)
165+
).with_columns(
166+
(pl.lit("A_") + pl.col("column_4")).alias("column_4")
167+
),
168+
b_fragments.filter(pl.col("column_4").is_in(cell_type_bcs)
169+
).with_columns(
170+
(pl.lit("B_") + pl.col("column_4")).alias("column_4")
171+
)
172+
]
173+
).sort(by=["column_1", "column_2", "column_3", "column_4"])
174+
generated_fragments_cell_type = pl.read_csv(
175+
os.path.join(output_folder, f"{cell_type}.fragments.tsv.gz"),
176+
separator ="\t",
177+
has_header=False
178+
).sort(by=["column_1", "column_2", "column_3", "column_4"])
179+
assert_frame_equal(
180+
fragments_cell_type,
181+
generated_fragments_cell_type
182+
)
104183

105184
def test_split_command_bc_single_type(tmp_path):
106185
split_command_test_helper(tmp_path, FILES_ALL_BARCODES_MAPPING_TO_SINGLE_TYPE)
107186

108187
def test_split_command_barcode_mapping_multiple_types(tmp_path):
109188
split_command_test_helper(tmp_path, FILES_SOME_BARCODES_MAPPING_TO_MULTIPLE_TYPES)
110189

190+
def test_split_command_w_sample_id_bc_single_type(tmp_path):
191+
split_command_test_helper_w_sample_id(tmp_path, FILES_ALL_BARCODES_MAPPING_TO_SINGLE_TYPE)
192+
193+
def test_split_command_w_sample_id_barcode_mapping_multiple_types(tmp_path):
194+
split_command_test_helper_w_sample_id(tmp_path, FILES_SOME_BARCODES_MAPPING_TO_MULTIPLE_TYPES)
195+

0 commit comments

Comments
 (0)