
Commit 9839b98

6 - Allow for stripping from static name lists (#10)
* 6 - initial work
* 6 - add tests and cleanup.
* 6 - make the replacer case insensitive.
* 6 - throw an exit code of 1 if any of the processes fail and throw an exception if the data source is not a csv or tsv.
1 parent dd463fa commit 9839b98

File tree: 9 files changed (+522, −58 lines)

README.md

Lines changed: 4 additions & 2 deletions
@@ -22,7 +22,9 @@ The scripts are easily run via the Dockerfile included in this repo.
 4. Get ready for the transcription process by running `python scripts/create_transcription_process.py /data`
 5. Use the `transcribe_pdfs.py` script to transcribe the files needed.
    - `python /workspace/scripts/transcribe_pdfs.py /workspace/output`
-6. Use the `pii_scrubber.py` script to remove PII from the text.
+6. Use the `/workspace/scripts/replace_strings.py` script to replace text in the files.
+   - `python /workspace/scripts/replace_strings.py /path/to/dog_profile.tsv "subject_id" "<ID>" /workspace/output`
+7. Use the `pii_scrubber.py` script to remove PII from the text.
    - `python /workspace/scripts/scrubbers/pii_scrubber.py /workspace/output`
-7. Use the scripts in the `scripts/metadata_miners` directory to find data in the text.
+8. Use the scripts in the `scripts/metadata_miners` directory to find data in the text.
    - `python /workspace/scripts/metadata_miners/visit_date_miner.py /workspace/output --visit_date_tsv=/path/to/vet_visits.tsv --dog_profile_tsv=/path/to/dog_profile.tsv`
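For context on the new step 6: `replace_strings.py` reads every value in the named column of the CSV/TSV and replaces each occurrence of those values in the transcribed text with the replacement string (matching is case insensitive, per the commit message). A quick sketch with a hypothetical `dog_profile.tsv` (tab-separated):

```
subject_id	name	breed
D-1001	Rex	Beagle
D-1002	Fido	Terrier
```

Running the command above against this file would rewrite a transcribed line such as `Patient D-1001 returned for a follow-up.` to `Patient <ID> returned for a follow-up.`; a second pass with `"name"` as the key column could strip the dog names the same way.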

scripts/metadata_miners/visit_date_miner.py

Lines changed: 2 additions & 4 deletions
@@ -49,8 +49,7 @@ def get_date_pairs_within_days(
     return result_pairs


-def get_dog_dates(parsed_args: argparse.Namespace, subject_id: str) -> Tuple[
-        Optional[datetime.date], Optional[datetime.date]]:
+def get_dog_dates(parsed_args: argparse.Namespace, subject_id: str) -> Tuple[Optional[datetime.date], Optional[datetime.date]]:
     """
     Retrieves the dog's birth and death dates from the TSV files.

@@ -136,8 +135,7 @@ def update_existing_records(session: sqlalchemy.orm.session.Session, subject_id:


 def get_existing_date_pairs(session: sqlalchemy.orm.session.Session, subject_id: str, input_id: int,
-                            date_pairs: set[tuple[datetime, datetime]]) -> Set[
-                                Tuple[datetime.date, datetime.date]]:
+                            date_pairs: set[tuple[datetime, datetime]]) -> Set[Tuple[datetime.date, datetime.date]]:
     """
     Retrieves existing date pairs from the database.

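One thing the joined signatures make visible is the file's mixed typing styles: the parameter uses the builtin-generic `set[tuple[datetime, datetime]]` while the return type uses `typing.Set`/`typing.Tuple`. Should the project ever standardize (a hypothetical cleanup, not part of this commit), the modern equivalent on Python 3.10+ would be:

```python
import argparse
import datetime

# Builtin generics (Python 3.9+) and unions (3.10+) in place of Tuple/Optional:
def get_dog_dates(parsed_args: argparse.Namespace, subject_id: str) -> tuple[datetime.date | None, datetime.date | None]:
    ...
```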

scripts/replace_strings.py

Lines changed: 237 additions & 0 deletions
import argparse
import csv
import os
import sys
import traceback
from concurrent.futures import ProcessPoolExecutor
from typing import List, Optional

from vmrt_tesseract_utilities.database import (TranscriptionInput,
                                               TranscriptionOutput,
                                               get_database_session)
from vmrt_tesseract_utilities.logging import stdout_logger
from vmrt_tesseract_utilities.string_replacer import StringReplacer

"""
Replaces strings in scrubbed text files from a CSV or TSV.
"""


def read_target_strings(data_file: str, key_column: str) -> List[str]:
    """
    Reads target strings from a CSV or TSV file.

    Parameters
    ----------
    data_file : str
        The path to the CSV or TSV file.
    key_column : str
        The name of the column containing the target strings.

    Returns
    -------
    list
        A list of strings extracted from the specified column.

    Raises
    ------
    ValueError
        If the provided file is not a CSV or TSV, or if the specified key column is not found.
    FileNotFoundError
        If the specified data file does not exist.
    """
    strings = []
    # Detect file type based on extension
    if data_file.endswith(".csv"):
        delimiter = ","
    elif data_file.endswith(".tsv"):
        delimiter = "\t"
    else:
        raise ValueError("Invalid data file format. The data_file must be a CSV or TSV.")
    try:
        # Read the target strings from the specified column.
        with open(data_file, "r", newline="") as csvfile:
            reader = csv.DictReader(csvfile, delimiter=delimiter)
            if key_column not in reader.fieldnames:
                raise ValueError(f"Key column '{key_column}' not found in data file.")
            strings.extend([row[key_column] for row in reader])
            return strings
    except FileNotFoundError as e:
        stdout_logger.error(f"Data file not found: {e}")
        raise
    except Exception as e:
        stdout_logger.error(f"An error occurred in read_target_strings: {e}\n{traceback.format_exc()}")
        raise

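A usage sketch for `read_target_strings` (the file and column names here are hypothetical): `csv.DictReader` keys each row by the header line, so only the requested column's values are collected, in file order.

```python
# Given a names.tsv containing (tab-separated):
#   subject_id	name
#   D-1001	Rex
#   D-1002	Fido
targets = read_target_strings("names.tsv", "subject_id")
print(targets)  # ['D-1001', 'D-1002']
```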
def process_file(output_log, strings_to_replace: List[str], parsed_args: argparse.Namespace) -> Optional[object]:
    """
    Processes a single file, replacing text and writing outputs.

    Parameters
    ----------
    output_log : TranscriptionOutput
        An object containing file data.
    strings_to_replace : list
        The list of strings to search for.
    parsed_args : argparse.Namespace
        The parsed args.

    Returns
    -------
    TranscriptionOutput
        The updated file data object.
    """
    input_file = None
    try:
        if hasattr(output_log, 'list_replacement_output_file') and output_log.list_replacement_output_file:
            input_file = output_log.list_replacement_output_file
            output_file = input_file
        else:
            input_file = output_log.ocr_output_file
            input_filename = os.path.basename(str(input_file))
            filename_without_extension = os.path.splitext(input_filename)[0]
            output_dir = f"{parsed_args.output_dir}/list_replacement_output_file/{parsed_args.document_type}"
            os.makedirs(output_dir, exist_ok=True)
            output_file = f"{output_dir}/{filename_without_extension}.txt"

        with open(str(input_file), "r") as f:
            orig_text = f.read()

        replacer = StringReplacer(strings_to_replace, parsed_args.replacement_string)
        scrubbed_text = replacer.replace(orig_text)

        with open(output_file, "w") as outfile:
            outfile.write(scrubbed_text)
        stdout_logger.info(f"Scrubbed file written to {output_file}")

        output_log.list_replacement_output_file = output_file
        return output_log
    except Exception as e:
        stdout_logger.error(f"An error occurred while processing {input_file}: {e}\n{traceback.format_exc()}")
        return None

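`process_file` delegates the actual matching to `StringReplacer`, which lives in `vmrt_tesseract_utilities` and is not shown in this diff. Going only by the commit message ("make the replacer case insensitive"), its behavior is presumably along these lines; this is an illustrative stand-in, not the project's implementation:

```python
import re
from typing import List


class CaseInsensitiveReplacer:
    """Replaces any of the target strings with one replacement, ignoring case."""

    def __init__(self, targets: List[str], replacement: str) -> None:
        # Assumes a non-empty target list. Escape targets so they match
        # literally; longest first so e.g. "Rexford" is not half-replaced
        # by a shorter target "Rex".
        escaped = sorted((re.escape(t) for t in targets if t), key=len, reverse=True)
        self._pattern = re.compile("|".join(escaped), re.IGNORECASE)
        self._replacement = replacement

    def replace(self, text: str) -> str:
        return self._pattern.sub(self._replacement, text)


print(CaseInsensitiveReplacer(["Rex"], "<ID>").replace("REX saw rex"))  # <ID> saw <ID>
```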
def scrub_and_write_files(process_filepath_data: List[object], strings_to_replace: List[str], parsed_args: argparse.Namespace, use_multiprocessing: bool = True) -> None:
    """
    Processes a list of files, replacing text and writing outputs in parallel.

    Parameters
    ----------
    process_filepath_data : list
        A list of file data objects.
    strings_to_replace : list
        The list of strings to search for.
    parsed_args : argparse.Namespace
        The parsed args.
    use_multiprocessing : bool
        Whether to use multiprocessing or not.
    """
    session_maker = get_database_session(echo=parsed_args.debug_sql)
    batch_size = parsed_args.chunk_size
    success = True
    for i in range(0, len(process_filepath_data), batch_size):
        batch = process_filepath_data[i:i + batch_size]
        with session_maker.begin() as session:
            if use_multiprocessing:
                with ProcessPoolExecutor(max_workers=parsed_args.max_workers) as executor:
                    results = executor.map(process_file_wrapper, batch, [strings_to_replace] * len(batch), [parsed_args] * len(batch))
                    for output_log in results:
                        if output_log:
                            session.add(output_log)
                        else:
                            success = False
            else:
                for output_log in batch:
                    result = process_file_wrapper(output_log, strings_to_replace, parsed_args)
                    if result:
                        session.add(result)
                    else:
                        success = False
    if not success:
        # If any files failed to process, exit with an error code.
        sys.exit(1)

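A note on the `executor.map` call above: `map` takes one iterable per positional parameter of the target function and zips them, which is why the constant arguments are tiled with `[...] * len(batch)`. A standalone illustration of that calling convention:

```python
from concurrent.futures import ProcessPoolExecutor


def tag(item: str, strings: list, options: str) -> str:
    return f"{item} <- {strings} ({options})"


if __name__ == "__main__":
    batch = ["a.txt", "b.txt"]
    with ProcessPoolExecutor(max_workers=2) as executor:
        # Zipped calls: tag("a.txt", ["Rex"], "opts"), tag("b.txt", ["Rex"], "opts")
        for line in executor.map(tag, batch, [["Rex"]] * len(batch), ["opts"] * len(batch)):
            print(line)
```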
def process_file_wrapper(output_log: object, strings_to_replace: List[str], parsed_args: argparse.Namespace) -> Optional[object]:
    """
    Wrapper function to create a new session for each process.

    Parameters
    ----------
    output_log : TranscriptionOutput
        An object containing file data.
    strings_to_replace : list
        The list of strings to search for.
    parsed_args : argparse.Namespace
        The parsed args.

    Returns
    -------
    TranscriptionOutput
        The updated file data object.
    """
    session_maker = get_database_session(echo=parsed_args.debug_sql)
    with session_maker.begin() as session:
        session.add(output_log)
        return process_file(output_log, strings_to_replace, parsed_args)

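`process_file_wrapper` gives each worker its own session because SQLAlchemy engines and sessions are not safe to share across process boundaries. The general shape of that pattern (SQLAlchemy 1.4+), sketched against a throwaway SQLite database; the URL and the body are placeholders, not this project's code:

```python
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker


def work(record_id: int) -> int:
    # Build the engine and session inside the worker, never before the fork.
    engine = create_engine("sqlite:///example.db")  # placeholder URL
    session_factory = sessionmaker(bind=engine)
    with session_factory.begin() as session:  # commits on exit, rolls back on error
        pass  # load and update the record here
    return record_id
```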
def get_files_to_process(args: argparse.Namespace) -> list:
    """
    Gets a list of input files to process.

    Parameters
    ----------
    args: argparse.Namespace
        The parsed args.

    Returns
    -------
    results: list
        The list of input files.
    """
    sessionmaker = get_database_session(echo=args.debug_sql)
    with sessionmaker.begin() as session:
        query = (session.query(TranscriptionOutput)
                 .outerjoin(TranscriptionInput.assets)
                 .where(TranscriptionInput.document_type == args.document_type)
                 .where(TranscriptionOutput.ocr_output_file != None)  # noqa: E711
                 .limit(args.chunk_size)
                 .offset(args.offset))
        return query.all()

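The `# noqa: E711` comment silences flake8, which flags `!= None`; the comparison is intentional here, since SQLAlchemy overloads it to emit `IS NOT NULL`. On SQLAlchemy 1.4+ the same filter can be written without the noqa, e.g.:

```python
from sqlalchemy import select

from vmrt_tesseract_utilities.database import TranscriptionOutput

# is_not() renders the same IS NOT NULL SQL as `!= None`, minus the lint warning.
stmt = select(TranscriptionOutput).where(
    TranscriptionOutput.ocr_output_file.is_not(None))
```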
def parse_args() -> argparse.Namespace:
    """
    Parses the required args.

    Returns
    -------
    args : argparse.Namespace
        The parsed args.
    """
    parser = argparse.ArgumentParser(description="Replace strings in scrubbed text files from a CSV or TSV.")
    parser.add_argument("data_file", help="Path to the CSV or TSV file containing the strings.")
    parser.add_argument("key_column", help="Name of the column in the CSV/TSV containing the keys.")
    parser.add_argument("replacement_string", help="The string to replace the keys with.")
    parser.add_argument("output_dir", help="Path to the output directory.")
    parser.add_argument("--document_type", type=str, default="document", help="The document type we want to produce: document, page or block.")
    parser.add_argument("--chunk_size", type=int, default=1000, help="The number of records to process.")
    parser.add_argument("--offset", type=int, default=0, help="The number of records to skip before beginning processing.")
    parser.add_argument("--debug-sql", action="store_true", help="Enable SQL debugging")
    parser.add_argument("--no-multiprocessing", action="store_true", help="Disable multiprocessing for debugging")
    parser.add_argument("--max-workers", type=int, default=4, help="Maximum number of worker processes for multiprocessing")
    return parser.parse_args()

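Putting the arguments together, a typical invocation (using the README's example paths) would be:

- `python /workspace/scripts/replace_strings.py /path/to/dog_profile.tsv "subject_id" "<ID>" /workspace/output --document_type document --max-workers 4`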
if __name__ == "__main__":
    args = parse_args()
    try:
        results = get_files_to_process(args)
        target_strings = read_target_strings(args.data_file, args.key_column)
        scrub_and_write_files(results, target_strings, args, not args.no_multiprocessing)
    except Exception as e:
        stdout_logger.error(f"Error in main execution: {e}\n{traceback.format_exc()}")
        # If an error occurs, exit with an error code.
        sys.exit(1)

scripts/scrubbers/pii_scrubber.py

Lines changed: 21 additions & 51 deletions
@@ -10,6 +10,7 @@
                                                TranscriptionOutput,
                                                get_database_session)
 from vmrt_tesseract_utilities.logging import stdout_logger
+from vmrt_tesseract_utilities.scrubbing_utils import write_scrubbed_txt

 """
 Leverages presidio to attempt automatic PII stripping.
@@ -83,26 +84,6 @@ def scrub_pii(text: str, analyzer: AnalyzerEngine, threshold: float) -> tuple[st
         raise


-def write_scrubbed_txt(output_filename: str, anonymized_text: str) -> None:
-    """
-    Writes the anonymized text to an output file.
-
-    Parameters
-    ----------
-    output_filename : str
-        The path to the file.
-    anonymized_text : str
-        The anonymized text.
-    """
-    try:
-        if anonymized_text:
-            with open(output_filename, 'w') as f:
-                f.write(anonymized_text)
-    except Exception as e:
-        stdout_logger.error(f'Error writing scrubbed output: {e}')
-        raise
-
-
 def write_confidence_record(filename: str, filtered_results: list, original_text: str) -> None:
     """
     Writes the filtered results to a JSON file.
@@ -133,31 +114,6 @@ def write_confidence_record(filename: str, filtered_results: list, original_text
         raise


-def get_output_strategy_from_path(file_path: str) -> str:
-    """
-    Determines the type of path based on path segments.
-
-    Parameters
-    ----------
-    file_path : str
-        The file_path to check.
-
-    Returns
-    -------
-    str
-        The extracted path type.
-    """
-    parts = file_path.split(os.sep)
-    if 'doc' in parts:
-        return 'doc'
-    elif 'page' in parts:
-        return 'page'
-    elif "unstructured_text" in parts:
-        return parts[parts.index("unstructured_text") + 1]
-    else:
-        return 'page'  # Default to page
-
-
 def process_files(process_filepath_data: list, analyzer: AnalyzerEngine,
                   output_dir: str, threshold: float) -> None:
     """
@@ -175,24 +131,38 @@
         The confidence threshold for PII detection.
     """
     sessionmaker = get_database_session(echo=args.debug_sql)
-    with sessionmaker.begin() as session:
+    with sessionmaker() as session:
         for output_log in process_filepath_data:
-            with open(str(output_log.ocr_output_file), 'r') as f:
+            session.add(output_log)  # Ensure the instance is bound to the session
+            # Use the replacement file if it exists, otherwise use the OCR output.
+            if hasattr(output_log, 'list_replacement_output_file') and output_log.list_replacement_output_file:
+                input_filepath = output_log.list_replacement_output_file
+            else:
+                input_filepath = output_log.ocr_output_file
+            # Read the original text from the input file.
+            with open(str(input_filepath), 'r') as f:
                 orig_text = f.read()
+            stdout_logger.info(f"Scrubbing {input_filepath}")
+            # Scrub the PII from the text.
             scrubbed_text, result_output = scrub_pii(orig_text, analyzer, threshold)
-            input_filename = os.path.basename(str(output_log.ocr_output_file))
+            # Write the scrubbed text to a file.
+            input_filename = os.path.basename(str(input_filepath))
             filename_without_extension = os.path.splitext(input_filename)[0]
             scrubbed_dir = f'{output_dir}/scrubbed_text/{args.document_type}/scrubbed_{args.document_type}'
             os.makedirs(scrubbed_dir, exist_ok=True)  # Create directory if needed
             output_file = f'{scrubbed_dir}/{filename_without_extension}.txt'
             write_scrubbed_txt(output_file, scrubbed_text)
+            stdout_logger.info(f"Scrubbed file written to {output_file}")
+            # Write the scrubbed confidence data to a file.
             output_log.pii_scrubber_output_file = output_file
             confidence_dir = f'{output_dir}/scrubbed_text/{args.document_type}/scrubbed_confidence'
             os.makedirs(confidence_dir, exist_ok=True)
             confidence_file = f'{confidence_dir}/confidence-{filename_without_extension}.json'
             output_log.pii_scrubber_confidence_file = confidence_file
             write_confidence_record(confidence_file, result_output, orig_text)
+            # Log the changes to the database.
             session.add(output_log)
+            session.commit()


 def get_files_to_process(args: argparse.Namespace) -> list:
@@ -214,8 +184,8 @@ def get_files_to_process(args: argparse.Namespace) -> list:
         query = (session.query(TranscriptionOutput)
                  .outerjoin(TranscriptionInput.assets)
                  .where(TranscriptionInput.document_type == args.document_type)
-                 .where(TranscriptionOutput.ocr_output_file != None)
-                 .where(TranscriptionOutput.pii_scrubber_output_file == None)
+                 .where(TranscriptionOutput.ocr_output_file != None)  # noqa: E711
+                 .where(TranscriptionOutput.pii_scrubber_output_file == None)  # noqa: E711
                  .limit(args.chunk_size)
                  .offset(args.offset))
         return query.all()
@@ -252,4 +222,4 @@ def parse_args() -> argparse.Namespace:
     args = parse_args()
     db_output_logs = get_files_to_process(args)
     nlp_engine = create_nlp_engine(args.config)
-    process_files(db_output_logs, nlp_engine, args.output_to, args.threshold)
+    process_files(db_output_logs, nlp_engine, args.output_to, args.threshold)
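The switch in `process_files` from `sessionmaker.begin()` to a plain `sessionmaker()` call, plus the explicit `session.commit()` inside the loop, changes the transaction granularity: one failure no longer rolls back every file already scrubbed in the run. A minimal sketch of the two patterns (in-memory SQLite, illustration only):

```python
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

session_factory = sessionmaker(bind=create_engine("sqlite://"))

# Before: begin() wraps the whole block in one transaction, committed on exit.
with session_factory.begin() as session:
    pass  # session.add(...) here; everything commits (or rolls back) together

# After: a plain session with explicit commits, persisting each record as it finishes.
with session_factory() as session:
    pass  # session.add(...) then session.commit() per record
```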
