morrisanimalfoundation
diff --git a/‎.github/workflows/sort_and_lint.yaml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/sort_and_lint.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Dockerfile‎
Lines changed: 4 additions & 5 deletions b/‎Dockerfile‎
Lines changed: 4 additions & 5 deletions
diff --git a/‎README.md‎
Lines changed: 20 additions & 9 deletions b/‎README.md‎
Lines changed: 20 additions & 9 deletions
diff --git a/‎docker-compose.yml‎
Lines changed: 26 additions & 0 deletions b/‎docker-compose.yml‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎example.env‎
Lines changed: 2 additions & 0 deletions b/‎example.env‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 4 additions & 2 deletions b/‎pyproject.toml‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎requirements.txt‎
Lines changed: 98 additions & 0 deletions b/‎requirements.txt‎
Lines changed: 98 additions & 0 deletions
diff --git a/‎requirements/ci_requirements.txt‎
Lines changed: 5 additions & 0 deletions b/‎requirements/ci_requirements.txt‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎requirements/full.txt‎
Lines changed: 0 additions & 46 deletions b/‎requirements/full.txt‎
Lines changed: 0 additions & 46 deletions
diff --git a/‎run.sh‎
Lines changed: 41 additions & 10 deletions b/‎run.sh‎
Lines changed: 41 additions & 10 deletions
@@ -27,7 +27,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        if [ -f requirements/ci_requirements.txt ]; then pip install -r requirements/ci_requirements.txt; fi
     - name: Lint with flake8 and isort
       run: |
         touch settings.py
 
@@ -1,15 +1,14 @@
 FROM python:3.11
 COPY requirements.txt .
-COPY requirements/full.txt .
 ENV PYTHONPATH=\/workspace
 ARG USER_ID
 RUN apt-get update -y && \
-    apt-get -y install poppler-utils tesseract-ocr yq
-RUN git clone https://github.com/tesseract-ocr/tessdata.git /usr/share/tessdata
-RUN useradd -l -u ${USER_ID} -g sudo jenkins && \
+    apt-get -y install poppler-utils tesseract-ocr yq &&\
+    git clone https://github.com/tesseract-ocr/tessdata.git /usr/share/tessdata &&\
+    useradd -l -u ${USER_ID} -g sudo jenkins && \
     mkdir -m 0755 /home/jenkins && chown jenkins /home/jenkins
 USER jenkins
-RUN pip install -r full.txt -r requirements.txt --trusted-host pypi.python.org --no-cache-dir && \
+RUN pip install -r requirements.txt --trusted-host pypi.python.org --no-cache-dir &&\
     python -m spacy download en_core_web_sm && \
     python -m spacy download en_core_web_lg
 ENV PATH="/home/jenkins/.local/bin:$PATH"
 
@@ -1,15 +1,26 @@
 # Veterinary Medical Record Transcriber (VMRT) Tesseract Utilities
+
 The Golden Retriever lifetime study has thousands of electronic medical records (EMRs) that have valuable information. The VMRT project is an attempt to automate data extraction from these EMRs. This repository contains some very simple and crude Tesseract scripts to help evaluate our dataset. The unstructured text extracted from the EMRs may or may not be valuable, but understanding the quantity of low confidence records is very useful.
 
-Goals
-* Build dataset to understand composition of EMRs
-  * Kind of files
-  * Enrollment status
-* Determine confidence scores for ORC extraction from PDFs
-* Evaluate extracted text to determine Tesseract fit for project
+Goals:
+
+- Build dataset to understand composition of EMRs
+- Homogenize format of PDF and text files (more to come)
+- Determine confidence scores for optical character recognition from PDFs
+- Automatically scrub personally (or dog) identifiable information (PII)
+- Perform plain text substitution on corpus
+- Extract metadata, such as subject id, study year, related visit
 
 # Running the scripts
+
 The scripts are easily run via the Dockerfile included in this repo.
-1. Build the container like usual. `docker build -t <container name> .` Run the scripts `docker run --rm -v <path to data>/data -v <path to code>/workspace <image name> <script name>`
-2. To produce a file map that is compatible with the Tesseract utilitiy run the file_info.py script over the data directory. Output is printed to stdout.
-3. To process the file map produced above run the image_to_text.py with the file map. An output directory with an `unstructured_text` folder is also required. Output is dumped to output folder.
+
+1. Copy the example.env file to .env and fill in the values.  
+    a. The value for `SQL_CONNECTION_STRING` should be the connection string for the database container (e.g. `mysql://user:password@vmrt-emr-process-log-mysql:3306/vmrt_emr_transcription`).
+2. The easiest way to spin up the Docker containers is to run the `run.sh` script from the repository root directory. You can also build the images yourself with Docker Compose; pass your user ID, which the Dockerfile uses to create the container user: `docker compose build --build-arg USER_ID=$(id -u)`
+3. Set up your DB by running `python /workspace/scripts/database_setup.py install` within the container.
+4. Get ready for the transcription process by running `python scripts/create_transcription_process.py /data`
+5. Use the `transcribe_pdfs.py` script to transcribe the files needed.
+    - `python /workspace/scripts/transcribe_pdfs.py /workspace/output`
+6. Use the `pii_scrubber.py` script to remove PII from the text.
+    - `python /workspace/scripts/scrubbers/pii_scrubber.py`
@@ -0,0 +1,26 @@
+services:
+  # MySQL 8.0 container; run.sh creates the vmrt_emr_transcription database in it.
+  vmrt-emr-process-log-mysql:
+    # Pass the variables from the local .env file into the container.
+    # `required: true` makes compose fail if the file is missing.
+    env_file:
+      - path: ./.env
+        required: true
+    container_name: vmrt-emr-process-log-mysql
+    image: mysql:8.0
+    command: --default-authentication-plugin=mysql_native_password
+    restart: always
+    environment:
+      # Fail fast with a clear message when SQL_PASSWORD is unset or empty,
+      # instead of starting MySQL with an empty root password.
+      MYSQL_ROOT_PASSWORD: "${SQL_PASSWORD:?Set SQL_PASSWORD in your .env file}"
+  # Container built from the build context in this directory (`build: .`).
+  vmrt-emr-workspace:
+    container_name: vmrt-emr-workspace
+    build: .
+    # Keep the container running idle so commands can be run in it
+    # with `docker exec` (as run.sh does).
+    command: tail -f /dev/null
+    volumes:
+      # Mount this directory at /workspace.
+      - ./:/workspace
+      # Mount the emr-source named volume (declared under `volumes:`) at /data.
+      - emr-source:/data
+
+# Named volumes used by the services above.
+volumes:
+  # Local-driver volume configured as a bind mount of a host directory.
+  emr-source:
+    driver: local
+    driver_opts:
+      o: bind
+      type: none
+      # `\ ` inside a double-quoted YAML scalar is an escaped space, so the
+      # directory names here are "MAF Dropbox" and "ENROLLED DOGS".
+      # Do not switch to single quotes: the backslashes would become literal.
+      device: "$HOME/MAF\ Dropbox/GRLS/Operations/ENROLLED\ DOGS"
@@ -0,0 +1,2 @@
+SQL_CONNECTION_STRING=''
+SQL_PASSWORD=''
@@ -6,7 +6,9 @@ build-backend = "setuptools.build_meta"
 name = "vmrt_tesseract_utilities"
 version = "1.0"
 description = "A utility script for extracting text and scrubbing PII from EMR data."
-packages=["vmrt_tesseract_utilities"]
 authors = [
   {name = "Morris Animal Foundation", email = "[email protected]"},
-]
+]
+
+[tool.setuptools]
+packages=["vmrt_tesseract_utilities"]
@@ -1,5 +1,103 @@
+annotated-types==0.6.0
+azure-core==1.32.0
+beautifulsoup4==4.12.3
+blis==1.1.0
+catalogue==2.0.10
+certifi==2024.12.14
+cffi==1.16.0
+charset-normalizer==3.4.0
+click==8.1.8
+cloudpathlib==0.20.0
+colorclass==2.2.2
+compressed-rtf==1.0.6
+confection==0.1.5
+cryptography==43.0.0
+cymem==2.0.10
+dnspython==2.6.1
+easygui==0.98.3
+ebcdic==1.1.1
+email_validator==2.1.1
+en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl#sha256=293e9547a655b25499198ab15a525b05b9407a75f10255e405e8c3854329ab63
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85
+extract-msg==0.48.7
+filelock==3.16.1
 flake8==7.1.1
+fsspec==2024.12.0
+greenlet==3.1.1
+huggingface-hub==0.27.0
+idna==3.7
 isort==5.13.2
+Jinja2==3.1.5
+langcodes==3.5.0
+language_data==1.3.0
+lark==1.1.9
+marisa-trie==1.2.1
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
 mccabe==0.7.0
+mdurl==0.1.2
+mpmath==1.3.0
+msoffcrypto-tool==5.4.1
+murmurhash==1.0.11
+mysqlclient==2.2.6
+networkx==3.4.2
+numpy==2.0.2
+olefile==0.47
+oletools==0.60.2
+packaging==24.0
+pandas==2.2.2
+pcodedmp==1.2.6
+pdf2image==1.17.0
+phonenumbers==8.13.52
+pillow==10.3.0
+preshed==3.0.9
+presidio_analyzer==2.2.355
+presidio_anonymizer==2.2.355
 pycodestyle==2.12.1
+pycparser==2.22
+pycryptodome==3.21.0
+pydantic==2.7.1
+pydantic_core==2.18.2
 pyflakes==3.2.0
+Pygments==2.18.0
+PyMySQL==1.1.1
+pyparsing==3.1.2
+pytesseract==0.3.10
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+pytz==2024.1
+PyYAML==6.0.2
+red-black-tree-mod==1.20
+regex==2024.11.6
+requests==2.32.3
+requests-file==2.1.0
+rich==13.9.4
+RTFDE==0.1.2
+safetensors==0.4.5
+shellingham==1.5.4
+six==1.16.0
+smart-open==7.1.0
+soupsieve==2.5
+spacy==3.8.2
+spacy-huggingface-pipelines==0.0.4
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+SQLAlchemy==2.0.36
+srsly==2.5.0
+sympy==1.13.1
+tesseract==0.1.3
+tesserocr==2.7.0
+thinc==8.3.3
+tldextract==5.1.3
+tokenizers==0.21.0
+torch==2.5.1
+tqdm==4.67.1
+transformers==4.47.1
+typer==0.15.1
+typing_extensions==4.11.0
+tzdata==2024.1
+tzlocal==5.2
+urllib3==2.3.0
+wasabi==1.1.3
+weasel==0.4.1
+wrapt==1.17.0
@@ -0,0 +1,5 @@
+flake8==7.1.1
+isort==5.13.2
+mccabe==0.7.0
+pycodestyle==2.12.1
+pyflakes==3.2.0
@@ -1,16 +1,47 @@
 #!/usr/bin/env bash
 
-# Should provide the directory where this script lives in most cases.
-SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+# Exit on Error.
+set -e
 
-# The name of our image from the Gitlab Container Registry.
-IMAGE_NAME="registry.gitlab.com/morrisanimalfoundation/grls:vmrt-tesseract-utilities"
+# Read our .env file.
+export $(grep -v '^#' .env | xargs)
 
-# Build the image with our special build args.
-# These matter more on Jenkins, but need to be placeheld anyway.
-docker image build -t $IMAGE_NAME --cache-from $IMAGE_NAME --cache-to type=inline --build-arg USER_ID=$(id -u ${USER}) .
+if [[ -z $SQL_PASSWORD ]]; then
+  echo "Error: Please set the SQL_PASSWORD variable in your .env file."
+  exit 1
+fi
 
 
-# Run the container in a disposable manner.
-# Add a volume to the current working dir.
-docker run --rm -it -v $HOME/MAF\ Dropbox/GRLS/Operations/ENROLLED\ DOGS:/data -v $SCRIPT_DIR:/workspace -v $HOME/.ssh:/home/jenkins/.ssh $IMAGE_NAME bash
+# Build the Docker images with the current user's ID.
+# The Dockerfile uses USER_ID when creating the container user.
+docker compose build --build-arg USER_ID="$(id -u ${USER})"
+
+# Start the containers in detached mode.
+docker compose up -d
+
+# Wait for the database container to accept connections (with a timeout).
+# The command is run directly as the loop condition, so its exit status is
+# tested and its output is never executed as a command.
+TIMEOUT=30
+COUNTER=0
+until docker exec -i vmrt-emr-process-log-mysql mysql -uroot -p"$SQL_PASSWORD" -e "DROP DATABASE IF EXISTS vmrt_emr_transcription; CREATE DATABASE vmrt_emr_transcription;"; do
+  # Give up only after a failed attempt, so a success on the final try
+  # is not misreported as a timeout.
+  if [[ $COUNTER -ge $TIMEOUT ]]; then
+    echo "Error: Timeout waiting for database container."
+    exit 1
+  fi
+  echo "Waiting for database container to start... ($COUNTER/$TIMEOUT)"
+  sleep 1
+  COUNTER=$((COUNTER+1))
+done
+
+echo "Database initialized successfully."
+
+# Execute the Python script.
+if ! docker exec -t vmrt-emr-workspace python ./scripts/database_setup.py install; then
+  echo "Error: Failed to execute Python script."
+  exit 1
+fi
+
+# Provide an interactive Bash shell within the container.
+# `|| true`: the shell's exit status is that of the last command typed in it;
+# without this, `set -e` would abort here and skip the teardown below.
+docker exec -it vmrt-emr-workspace bash || true
+
+# Stop and remove the containers.
+docker compose down
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+SQL_CONNECTION_STRING=''`
	`2`	`+SQL_PASSWORD=''`