import gzip
+import os
import pathlib
import subprocess
-import sys
import textwrap
-
+import logging
+from types import SimpleNamespace
+
+# Needed by the new helpers below: cyvcf2 for VCF I/O, concurrent.futures for
+# the process pool. The relative `pcgr` import is an assumption about where
+# `run_somatic` lives in this package.
+import concurrent.futures
+import cyvcf2

from .common import constants
+from . import pcgr

+# Set up logging
+logger = logging.getLogger(__name__)

# TODO(SW): add a note that this assumes a `<root>/<package>/<file>` layout
def get_project_root():
@@ -16,46 +19,58 @@ def get_project_root():
    return project_root


-def execute_command(command):
-    command_prepared = command_prepare(command)
+def execute_command(command, log_file_path=None):
+    """
+    Run a shell command, streaming its combined stdout/stderr to the logger
+    (and, if given, appending it to log_file_path) while collecting the output
+    for the caller.
+    """
+    logger.info("Executing command: %s", command.strip())

-    print(command_prepared)
+    # Open the log file if provided
+    log_file = log_file_path.open('a', encoding='utf-8') if log_file_path else None

-    process = subprocess.run(
-        command_prepared,
+    # Launch the process with stdout and stderr combined into a single stream,
+    # line buffered so output can be relayed as it is produced.
+    process = subprocess.Popen(
+        command,
        shell=True,
        executable='/bin/bash',
-        capture_output=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
        encoding='utf-8',
+        bufsize=1,  # line buffered
    )

-    if process.returncode != 0:
-        print(process)
-        print(process.stderr)
-        sys.exit(1)
-
-    return process
+    output_lines = []
+    # Relay each line as soon as it becomes available
+    with process.stdout:
+        for line in iter(process.stdout.readline, ''):
+            if line:
+                logger.info(line.strip())
+                output_lines.append(line)
+                if log_file:
+                    log_file.write(line)
+                    log_file.flush()  # flush immediately for real-time logging
+    process.wait()  # wait for the process to complete
+
+    if log_file:
+        log_file.close()
+
+    # Mimic the subset of subprocess.CompletedProcess that callers rely on
+    result = SimpleNamespace(
+        stdout=''.join(output_lines),
+        returncode=process.returncode,
+        pid=process.pid,
+        command=command,
+    )

+    return result
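+
+# Illustrative usage sketch (hypothetical command and log path, not taken from
+# the pipeline):
+#
+#     result = execute_command('echo hello | tr a-z A-Z', log_file_path=pathlib.Path('run.log'))
+#     if result.returncode != 0:
+#         logger.error('command failed: %s', result.command)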

def command_prepare(command):
    return f'set -o pipefail; {textwrap.dedent(command)}'

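+# For example (illustrative input):
+#     command_prepare('bcftools view -H in.vcf | wc -l')
+# returns 'set -o pipefail; bcftools view -H in.vcf | wc -l', so a bcftools
+# failure is not masked by the exit status of wc.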
-
-#def count_vcf_records(fp, exclude_args=None):
-#    args = list()
-#    if exclude_args:
-#        args.append(f'-e \'{exclude_args}\'')
-#
-#    args_str = ' '.join(args)
-#    command = f'bcftools view -H {args_str} {fp} | wc -l'
-#
-#    result = execute_command(command)
-#    return int(result.stdout)
-
-
def count_vcf_records(fp):
-    result = execute_command(f'bcftools view -H {fp} | wc -l')
-    return int(result.stdout)
+    result = subprocess.run(
+        command_prepare(f'bcftools view -H {fp} | wc -l'),
+        shell=True,
+        executable='/bin/bash',
+        capture_output=True,
+        text=True,
+        check=True,  # with pipefail set by command_prepare, a bcftools failure raises here
+    )
+    return int(result.stdout.strip())
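+# For a VCF with three records, `bcftools view -H` emits the three body lines
+# and `wc -l` prints 3; for example (hypothetical path):
+#
+#     n = count_vcf_records('sample.vcf.gz')  # -> 3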


def add_vcf_header_entry(fh, anno_enum):
@@ -95,15 +110,63 @@ def get_qualified_vcf_annotation(anno_enum):
    assert anno_enum in constants.VcfInfo or anno_enum in constants.VcfFormat
    return f'{anno_enum.namespace}/{anno_enum.value}'

+def split_vcf(input_vcf, output_dir):
+    """
+    Splits a VCF file into multiple chunks, each containing up to
+    constants.MAX_SOMATIC_VARIANTS variants. Each chunk includes the VCF
+    header. Ensures no position is split across chunks.
+    """
+    output_dir = pathlib.Path(output_dir) / "vcf_chunks"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    chunk_files = []
+    chunk_number = 1
+    variant_count = 0
+    base_filename = pathlib.Path(input_vcf).stem
+    chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
+    chunk_files.append(chunk_filename)
+
+    # Open the input VCF using cyvcf2
+    vcf_in = cyvcf2.VCF(input_vcf)
+    # Create a new VCF file for the first chunk
+    vcf_out = cyvcf2.Writer(str(chunk_filename), vcf_in)
+
+    last_position = None
+
+    for record in vcf_in:
+        current_position = record.POS
+        # Start a new chunk once the limit is reached, but only at a new
+        # position so records sharing a POS stay in the same chunk.
+        if variant_count >= constants.MAX_SOMATIC_VARIANTS and (last_position is None or current_position != last_position):
+            # Close the current chunk file and start a new one
+            vcf_out.close()
+            chunk_number += 1
+            chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
+            chunk_files.append(chunk_filename)
+            vcf_out = cyvcf2.Writer(str(chunk_filename), vcf_in)
+            variant_count = 0
+
+        # Write the record to the current chunk
+        vcf_out.write_record(record)
+        variant_count += 1
+        last_position = current_position
+
+    # Close the last chunk file
+    vcf_out.close()
+    vcf_in.close()
+
+    logger.info("VCF file split into %s chunks.", len(chunk_files))
+
+    return chunk_files
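+
+# Boundary behaviour, with a hypothetical MAX_SOMATIC_VARIANTS of 2 and input
+# positions [100, 101, 101, 105]: after two records the limit is reached, but
+# the second record at 101 shares its position with the previous one and so is
+# still written to chunk 1; chunk 2 then starts at 105. A multi-allelic
+# position is therefore never split across chunks.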

def merge_tsv_files(tsv_files, merged_tsv_fp):
    """
    Merges all TSV files into a single TSV.
    """
-    logger.info("Merging TSV files...")
-    with gzip.open(merged_tsv_fp, 'wt') as merged_tsv:
+    with open(merged_tsv_fp, 'w') as merged_tsv:
        for i, tsv_file in enumerate(tsv_files):
-            with gzip.open(tsv_file, 'rt') as infile:
+            with open(tsv_file, 'r') as infile:
                for line_number, line in enumerate(infile):
                    # Skip header except for the first file
                    if i > 0 and line_number == 0:
@@ -123,7 +186,6 @@ def merge_vcf_files(vcf_files, merged_vcf_fp):
    Returns:
    - Path to the sorted merged VCF file.
    """
-    logger.info("Merging VCF files...")
    merged_vcf_fp = pathlib.Path(merged_vcf_fp)
    merged_unsorted_vcf = merged_vcf_fp.with_suffix('.unsorted.vcf.gz')
    merged_vcf = merged_vcf_fp.with_suffix('.vcf.gz')
@@ -185,4 +247,48 @@ def merge_vcf_files(vcf_files, merged_vcf_fp):
    if merged_unsorted_vcf.exists():
        merged_unsorted_vcf.unlink()

-    return merged_vcf
+    return merged_vcf
+
+def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_files):
+    # Merge the per-chunk PCGR outputs into single TSV and VCF files
+    pcgr_dir = output_dir / 'pcgr'
+    pcgr_dir.mkdir(exist_ok=True)
+    # Merge all TSV files into a single file in the pcgr directory
+    merged_tsv_fp = os.path.join(pcgr_dir, "nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv")
+    merge_tsv_files(pcgr_tsv_files, merged_tsv_fp)
+    # Merge all VCF files into a single file in the pcgr directory
+    merged_vcf_path = os.path.join(pcgr_dir, "nosampleset.pcgr_acmg.grch38")
+    merged_vcf = merge_vcf_files(pcgr_vcf_files, merged_vcf_path)
+    return merged_vcf, merged_tsv_fp
+
+def run_somatic_chunk(vcf_chunks, pcgr_data_dir, output_dir, pcgr_output_dir, max_threads, pcgr_conda, pcgrr_conda):
+    pcgr_tsv_files = []
+    pcgr_vcf_files = []
+
+    num_chunks = len(vcf_chunks)
+    # Use at most one worker per chunk, give each worker at least two threads,
+    # and never request fewer than one worker.
+    max_workers = max(1, min(num_chunks, max_threads // 2))
+    threads_quot, threads_rem = divmod(max_threads, num_chunks)
+    threads_per_chunk = max(2, threads_quot)
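+    # Worked example with hypothetical numbers: max_threads=8 and num_chunks=3
+    # give max_workers=min(3, 4)=3 and divmod(8, 3)=(2, 2), so chunks 1 and 2
+    # run with 3 threads each and chunk 3 with 2, using all 8 threads.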
+
+    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
+        futures = {}
+        for chunk_number, vcf_file in enumerate(vcf_chunks, start=1):
+            # Assign an extra thread to each of the first 'threads_rem' chunks
+            additional_thread = 1 if chunk_number <= threads_rem else 0
+            total_threads = threads_per_chunk + additional_thread
+            futures[executor.submit(pcgr.run_somatic, vcf_file, pcgr_data_dir, pcgr_output_dir, chunk_number, total_threads, pcgr_conda, pcgrr_conda)] = chunk_number
+
+        for future in concurrent.futures.as_completed(futures):
+            try:
+                pcgr_tsv_fp, pcgr_vcf_fp = future.result()
+                if pcgr_tsv_fp:
+                    pcgr_tsv_files.append(pcgr_tsv_fp)
+                if pcgr_vcf_fp:
+                    pcgr_vcf_files.append(pcgr_vcf_fp)
+            except Exception as e:
+                logger.error("PCGR failed on chunk %s: %s", futures[future], e)
+
+    merged_vcf_fp, merged_tsv_fp = merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_files)
+    return merged_tsv_fp, merged_vcf_fp
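+
+# End-to-end sketch of how these helpers compose (hypothetical paths and
+# thread count):
+#
+#     chunks = split_vcf('somatic.vcf.gz', pathlib.Path('out'))
+#     tsv_fp, vcf_fp = run_somatic_chunk(
+#         chunks, pcgr_data_dir, pathlib.Path('out'),
+#         pathlib.Path('out/pcgr_chunks'), max_threads=8,
+#         pcgr_conda=None, pcgrr_conda=None)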