Commit ced742e

Merge remote-tracking branch 'origin/release/0.3.0' into feature/pcgr_v2.2.1
2 parents: acb6862 + d792557

File tree

15 files changed: +292 −67 lines


.gitignore

Lines changed: 2 additions & 0 deletions
@@ -5,3 +5,5 @@ __pycache__/
 build/
 venv/
 working/
+data/
+workspace/

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
@@ -4,4 +4,6 @@

 - [3](https://github.com/scwatts/bolt/pull/3) - Improve PCGR / CPSR argument handling

-- [6](https://github.com/umccr/bolt/pull/6) - Update for oncoanalyser v2.0.0, switching the SV caller from GRIPSS to eSVee
+- [6](https://github.com/umccr/bolt/pull/6) - Update for oncoanalyser v2.0.0, switching the SV caller from GRIPSS to eSVee
+
+- [9](https://github.com/umccr/bolt/pull/9) - Add hypermutation sample handling

bolt/common/constants.py

Lines changed: 38 additions & 1 deletion
@@ -4,7 +4,7 @@
 ######################################
 ## Variation selection (annotation) ##
 ######################################
-MAX_SOMATIC_VARIANTS = 500_000
+MAX_SOMATIC_VARIANTS = 450_000
 MAX_SOMATIC_VARIANTS_GNOMAD_FILTER = 0.01



@@ -41,6 +41,43 @@
 }


+################################
+## Hypermutated report filter ##
+################################
+PCGR_TIERS_FILTERING = (
+    'TIER_1',
+    'TIER_2',
+    'TIER_3',
+    'TIER_4',
+    'NONCODING',
+)
+
+VEP_IMPACTS_FILTER = (
+    'intergenic',
+    'intronic',
+    'downstream',
+    'upstream',
+    'impacts_other',
+)
+
+GENOMIC_REGIONS_FILTERING = (
+    'difficult',
+    'none',
+    'giab_conf',
+)
+
+HOTSPOT_FIELDS_FILTERING = (
+    'SAGE_HOTSPOT',
+    'hotspot',
+    'PCGR_MUTATION_HOTSPOT',
+)
+
+RETAIN_FIELDS_FILTERING = (
+    'PANEL',
+    *HOTSPOT_FIELDS_FILTERING,
+)
+
+
 ##################################################
 ## VCF FILTER tags and FORMAT, INFO annotations ##
 ##################################################
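Taken together, these tuples parameterize the hypermutated report filter: variants are pruned by PCGR tier, VEP impact class, and genomic-region confidence, while anything carrying a panel or hotspot annotation is retained regardless. A minimal sketch of how such a filter might consume these constants; the info dict and its field names are hypothetical, not part of this commit:

    from bolt.common import constants

    def keep_variant(info):
        # Hedged sketch: 'info' stands in for one variant's annotations.
        # Panel and hotspot variants are always retained.
        if any(field in info for field in constants.RETAIN_FIELDS_FILTERING):
            return True
        # Prune low-priority tiers, low-impact consequences, and difficult regions.
        if info.get('PCGR_TIER') in ('TIER_4', 'NONCODING'):
            return False
        if info.get('VEP_IMPACT') in constants.VEP_IMPACTS_FILTER:
            return False
        if info.get('REGION') in ('difficult', 'none'):
            return False
        return True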

bolt/common/pcgr.py

Lines changed: 5 additions & 6 deletions
@@ -7,13 +7,16 @@
 import re
 import shutil
 import tempfile
+import logging

 import cyvcf2


 from .. import util
 from ..common import constants

+# Use the existing logger configuration
+logger = logging.getLogger(__name__)

 def prepare_vcf_somatic(input_fp, tumor_name, normal_name, output_dir):

@@ -179,7 +182,7 @@ def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None,

     command = fr'''
     pcgr \
-        {command_args_str}
+        {command_args_str}
     '''

     if pcgr_conda:
if pcgr_conda:
@@ -262,7 +265,7 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, thre
     return cpsr_output_dir


-def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, output_dir):
+def transfer_annotations_somatic(input_fp, tumor_name, pcgr_vcf_fp, pcgr_tsv_fp, output_dir):
     # Set destination INFO field names and source TSV fields
     info_field_map = {
         constants.VcfInfo.PCGR_MUTATION_HOTSPOT: 'MUTATION_HOTSPOT',
@@ -297,10 +300,6 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, output_dir):
         # Do not process chrM since *snvs_indels.tiers.tsv does not include these annotations
         if record.CHROM == 'chrM':
             continue
-        # Immediately print out variants that were not annotated
-        if filter_name in record.FILTERS:
-            output_fh.write_record(record)
-            continue
         # Annotate and write
         record_ann = annotate_record(record, pcgr_data, allow_missing=True)
         output_fh.write_record(record_ann)
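With the new signature, callers hand transfer_annotations_somatic the merged PCGR output VCF and TSV directly rather than a FILTER name and PCGR directory, which is what allows annotations from chunked runs to be transferred. A hypothetical call, with illustrative paths and variable names:

    annotated_vcf_fp = pcgr.transfer_annotations_somatic(
        input_fp='tumor.annotations.vcf.gz',   # illustrative input path
        tumor_name='tumor',
        pcgr_vcf_fp=merged_vcf_fp,             # merged PCGR VCF across chunks
        pcgr_tsv_fp=merged_tsv_fp,             # merged *snvs_indels.tiers.tsv
        output_dir=output_dir,
    )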

bolt/logging_config.py

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+import logging
+import sys
+import pathlib
+from datetime import datetime
+
+class IgnoreTinfoFilter(logging.Filter):
+    def filter(self, record):
+        # Exclude messages that contain the unwanted text.
+        if "no version information available" in record.getMessage():
+            return False
+        return True
+
+def setup_logging(output_dir, script_name):
+    # Create a timestamp for the log file
+    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+    log_filename = f"{script_name}_{timestamp}.log"
+    log_file = pathlib.Path(output_dir) / log_filename
+
+    # Create individual handlers.
+    console_handler = logging.StreamHandler(sys.stdout)
+    file_handler = logging.FileHandler(log_file)
+
+    # Instantiate and attach the filter to both handlers.
+    tinfo_filter = IgnoreTinfoFilter()
+    console_handler.addFilter(tinfo_filter)
+    file_handler.addFilter(tinfo_filter)
+
+    logging.basicConfig(
+        level=logging.DEBUG,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        handlers=[file_handler, console_handler]
+    )
+    logger = logging.getLogger(__name__)
+    logger.info("Logging setup complete")

bolt/util.py

Lines changed: 140 additions & 34 deletions
@@ -1,12 +1,15 @@
 import gzip
+import os
 import pathlib
 import subprocess
-import sys
 import textwrap
-
+import logging
+from types import SimpleNamespace

 from .common import constants

+# Set up logging
+logger = logging.getLogger(__name__)

 # TODO(SW): create note that number this assumes location of `<root>/<package>/<file>`
 def get_project_root():
@@ -16,46 +19,58 @@ def get_project_root():
     return project_root


-def execute_command(command):
-    command_prepared = command_prepare(command)
+def execute_command(command, log_file_path=None):
+    logger.info("Executing command: %s", command.strip())

-    print(command_prepared)
+    # Open the log file if provided
+    log_file = log_file_path.open('a', encoding='utf-8') if log_file_path else None

-    process = subprocess.run(
-        command_prepared,
+    # Launch process with combined stdout and stderr streams, and line buffering enabled
+    process = subprocess.Popen(
+        command,
         shell=True,
         executable='/bin/bash',
-        capture_output=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
         encoding='utf-8',
+        bufsize=1,  # line buffered
     )

-    if process.returncode != 0:
-        print(process)
-        print(process.stderr)
-        sys.exit(1)
-
-    return process
+    output_lines = []
+    # Iterate over each line as it becomes available
+    with process.stdout:
+        for line in iter(process.stdout.readline, ''):
+            if line:
+                logger.info(line.strip())
+                output_lines.append(line)
+                if log_file:
+                    log_file.write(line)
+                    log_file.flush()  # flush immediately for real-time logging
+    process.wait()  # wait for the process to complete
+
+    if log_file:
+        log_file.close()
+
+    result = SimpleNamespace(
+        stdout=''.join(output_lines),
+        returncode=process.returncode,
+        pid=process.pid,
+        command=command,
+    )

+    return result

 def command_prepare(command):
     return f'set -o pipefail; {textwrap.dedent(command)}'

-
-#def count_vcf_records(fp, exclude_args=None):
-#    args = list()
-#    if exclude_args:
-#        args.append(f'-e \'{exclude_args}\'')
-#
-#    args_str = ' '.join(args)
-#    command = f'bcftools view -H {args_str} {fp} | wc -l'
-#
-#    result = execute_command(command)
-#    return int(result.stdout)
-
-
 def count_vcf_records(fp):
-    result = execute_command(f'bcftools view -H {fp} | wc -l')
-    return int(result.stdout)
+    result = subprocess.run(
+        f'bcftools view -H {fp} | wc -l',
+        shell=True,
+        executable='/bin/bash',
+        capture_output=True,
+        text=True,
+    )
+    return int(result.stdout.strip())


 def add_vcf_header_entry(fh, anno_enum):
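execute_command now streams each output line to the logger as it appears instead of buffering everything and exiting on failure, and it returns a SimpleNamespace exposing the fields callers previously read from CompletedProcess. One consequence is that callers must check returncode themselves. A hedged usage sketch:

    result = execute_command('bcftools --version | head -n1')
    if result.returncode != 0:
        logger.error('command failed (pid %s): %s', result.pid, result.command)
    else:
        logger.info('using %s', result.stdout.strip())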
@@ -95,15 +110,63 @@ def get_qualified_vcf_annotation(anno_enum):
     assert anno_enum in constants.VcfInfo or anno_enum in constants.VcfFormat
     return f'{anno_enum.namespace}/{anno_enum.value}'

+def split_vcf(input_vcf, output_dir):
+    """
+    Splits a VCF file into multiple chunks, each containing up to
+    constants.MAX_SOMATIC_VARIANTS variants. Each chunk includes the VCF
+    header; records sharing a position are never split across chunks.
+    """
+    output_dir = pathlib.Path(output_dir) / "vcf_chunks"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    chunk_files = []
+    chunk_number = 1
+    variant_count = 0
+    base_filename = pathlib.Path(input_vcf).stem
+    chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
+    chunk_files.append(chunk_filename)
+
+    # Open the input VCF using cyvcf2
+    vcf_in = cyvcf2.VCF(input_vcf)
+    # Create a new VCF file for the first chunk
+    vcf_out = cyvcf2.Writer(str(chunk_filename), vcf_in)
+
+    last_position = None
+
+    for record in vcf_in:
+        current_position = record.POS
+        # Start a new chunk once the cap is reached, but never between records sharing a position
+        if variant_count >= constants.MAX_SOMATIC_VARIANTS and (last_position is None or current_position != last_position):
+            # Close the current chunk file and start a new one
+            vcf_out.close()
+            chunk_number += 1
+            chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
+            chunk_files.append(chunk_filename)
+            vcf_out = cyvcf2.Writer(str(chunk_filename), vcf_in)
+            variant_count = 0
+
+        # Write the record to the current chunk
+        vcf_out.write_record(record)
+        variant_count += 1
+        last_position = current_position
+
+    # Close the last chunk file
+    vcf_out.close()
+    vcf_in.close()
+
+    logger.info(f"VCF file split into {len(chunk_files)} chunks.")
+
+    return chunk_files

 def merge_tsv_files(tsv_files, merged_tsv_fp):
     """
     Merges all TSV files into a single TSV.
     """
-    logger.info("Merging TSV files...")
-    with gzip.open(merged_tsv_fp, 'wt') as merged_tsv:
+    with open(merged_tsv_fp, 'w') as merged_tsv:
         for i, tsv_file in enumerate(tsv_files):
-            with gzip.open(tsv_file, 'rt') as infile:
+            with open(tsv_file, 'r') as infile:
                 for line_number, line in enumerate(infile):
                     # Skip header except for the first file
                     if i > 0 and line_number == 0:
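split_vcf keeps records with the same POS in the same chunk, so a multi-allelic site is never divided across PCGR runs; note it relies on cyvcf2 being importable at module level in util.py, which this hunk does not show. A hedged usage sketch with an illustrative input path:

    chunks = split_vcf(pathlib.Path('tumor.pass.vcf.gz'), pathlib.Path('work'))
    # e.g. [work/vcf_chunks/tumor.pass_chunk1.vcf, work/vcf_chunks/tumor.pass_chunk2.vcf, ...]
    # each chunk holds up to constants.MAX_SOMATIC_VARIANTS records plus any same-POS overflow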
@@ -123,7 +186,6 @@ def merge_vcf_files(vcf_files, merged_vcf_fp):
     Returns:
     - Path to the sorted merged VCF file.
     """
-    logger.info("Merging VCF files...")
     merged_vcf_fp = pathlib.Path(merged_vcf_fp)
     merged_unsorted_vcf = merged_vcf_fp.with_suffix('.unsorted.vcf.gz')
     merged_vcf = merged_vcf_fp.with_suffix('.vcf.gz')
@@ -185,4 +247,48 @@ def merge_vcf_files(vcf_files, merged_vcf_fp):
     if merged_unsorted_vcf.exists():
         merged_unsorted_vcf.unlink()

-    return merged_vcf
+    return merged_vcf
+
+
+def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_files):
+    # Merge chunked PCGR outputs into single files under the pcgr directory
+    pcgr_dir = output_dir / 'pcgr'
+    pcgr_dir.mkdir(exist_ok=True)
+    # Merge all TSV files into a single file in the pcgr directory
+    merged_tsv_fp = os.path.join(pcgr_dir, "nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv")
+    merge_tsv_files(pcgr_tsv_files, merged_tsv_fp)
+    # Merge all VCF files into a single file in the pcgr directory
+    merged_vcf_path = os.path.join(pcgr_dir, "nosampleset.pcgr_acmg.grch38")
+    merged_vcf = merge_vcf_files(pcgr_vcf_files, merged_vcf_path)
+    return merged_vcf, merged_tsv_fp
+
+
+def run_somatic_chunck(vcf_chunks, pcgr_data_dir, output_dir, pcgr_output_dir, max_threads, pcgr_conda, pcgrr_conda):
+    pcgr_tsv_files = []
+    pcgr_vcf_files = []
+
+    num_chunks = len(vcf_chunks)
+    # Use no more workers than there are chunks, and give each worker at least 2 threads
+    max_workers = min(num_chunks, max_threads // 2)
+    threads_quot, threads_rem = divmod(max_threads, num_chunks)
+    threads_per_chunk = max(2, threads_quot)
+
+    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
+        futures = {}
+        for chunk_number, vcf_file in enumerate(vcf_chunks, start=1):
+            # Assign an extra thread to the first 'threads_rem' chunks
+            additional_thread = 1 if chunk_number <= threads_rem else 0
+            total_threads = threads_per_chunk + additional_thread
+            futures[executor.submit(pcgr.run_somatic, vcf_file, pcgr_data_dir, pcgr_output_dir, chunk_number, total_threads, pcgr_conda, pcgrr_conda)] = chunk_number
+
+        for future in concurrent.futures.as_completed(futures):
+            try:
+                pcgr_tsv_fp, pcgr_vcf_fp = future.result()
+                if pcgr_tsv_fp:
+                    pcgr_tsv_files.append(pcgr_tsv_fp)
+                if pcgr_vcf_fp:
+                    pcgr_vcf_files.append(pcgr_vcf_fp)
+            except Exception as e:
+                logger.error(f"Exception occurred: {e}")
+
+    merged_vcf_fp, merged_tsv_fp = merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_files)
+    return merged_tsv_fp, merged_vcf_fp
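Together these helpers form the hypermutation path: split the somatic VCF into bounded chunks, annotate each chunk with PCGR in parallel, then merge the chunk VCFs and tier TSVs back into single files (run_somatic_chunck likewise assumes concurrent.futures and the pcgr module are imported in util.py). A hedged end-to-end sketch with illustrative paths and thread count:

    chunks = split_vcf(input_vcf, output_dir)  # chunk size bounded by MAX_SOMATIC_VARIANTS
    merged_tsv_fp, merged_vcf_fp = run_somatic_chunck(
        chunks,
        pcgr_data_dir,
        output_dir,
        output_dir / 'pcgr_chunks',   # illustrative chunk output directory
        max_threads=8,
        pcgr_conda=None,
        pcgrr_conda=None,
    )
    # the merged outputs then feed pcgr.transfer_annotations_somatic (see bolt/common/pcgr.py above)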
