Skip to content

Commit f1fa28a

Browse files
authored
chore: minor refactoring of GCS Functions in document wrapper (#39)
* chore: minor refactoring of GCS Functions in document wrapper - Simplified `print_gcs_document_tree` for readability/maintainability (And to resolve linter errors) - Added constants for reused values - Added `ignore_unknown_values` to `Document.from_json()` to avoid exceptions with new Document Proto versions between client library updates * chore: minor refactoring of GCS Functions in document wrapper - Simplified `print_gcs_document_tree` for readability/maintainability (And to resolve linter errors) - Added constants for reused values - Added `ignore_unknown_values` to `Document.from_json()` to avoid exceptions with new Document Proto versions between client library updates * chore: Fix to allow tests to pass
1 parent eeb1f98 commit f1fa28a

File tree

2 files changed

+35
-34
lines changed

2 files changed

+35
-34
lines changed

packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/constants.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,8 @@
1515
#
1616

1717
USER_AGENT_PRODUCT = "documentai-toolbox"
18+
19+
JSON_EXTENSION = ".json"
20+
JSON_MIMETYPE = "application/json"
21+
22+
FILE_CHECK_REGEX = r"(.*[.].*$)"

packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/document.py

Lines changed: 30 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@
1616
"""Wrappers for Document AI Document type."""
1717

1818
import dataclasses
19+
import os
1920
import re
20-
from typing import List, Optional
21+
from typing import Dict, List, Optional
2122

2223
from google.api_core import client_info
2324
from google.cloud import documentai
@@ -111,9 +112,11 @@ def _get_bytes(gcs_bucket_name: str, gcs_prefix: str) -> List[bytes]:
111112
blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix)
112113

113114
for blob in blob_list:
114-
if blob.name.endswith(".json"):
115-
blob_as_bytes = blob.download_as_bytes()
116-
result.append(blob_as_bytes)
115+
if (
116+
blob.name.endswith(constants.JSON_EXTENSION)
117+
or blob.content_type == constants.JSON_MIMETYPE
118+
):
119+
result.append(blob.download_as_bytes())
117120

118121
return result
119122

@@ -139,15 +142,15 @@ def _get_shards(gcs_bucket_name: str, gcs_prefix: str) -> List[documentai.Docume
139142
"""
140143
shards = []
141144

142-
file_check = re.match(r"(.*[.].*$)", gcs_prefix)
145+
file_check = re.match(constants.FILE_CHECK_REGEX, gcs_prefix)
143146

144147
if file_check is not None:
145148
raise ValueError("gcs_prefix cannot contain file types")
146149

147150
byte_array = _get_bytes(gcs_bucket_name, gcs_prefix)
148151

149152
for byte in byte_array:
150-
shards.append(documentai.Document.from_json(byte))
153+
shards.append(documentai.Document.from_json(byte, ignore_unknown_fields=True))
151154

152155
return shards
153156

@@ -170,45 +173,38 @@ def print_gcs_document_tree(gcs_bucket_name: str, gcs_prefix: str) -> None:
170173
None.
171174
172175
"""
173-
display_filename_prefix_middle = "├──"
174-
display_filename_prefix_last = "└──"
176+
FILENAME_TREE_MIDDLE = "├──"
177+
FILENAME_TREE_LAST = "└──"
178+
FILES_TO_DISPLAY = 4
175179

176-
file_check = re.match(r"(.*[.].*$)", gcs_prefix)
180+
file_check = re.match(constants.FILE_CHECK_REGEX, gcs_prefix)
177181

178182
if file_check is not None:
179183
raise ValueError("gcs_prefix cannot contain file types")
180184

181185
storage_client = _get_storage_client()
182186
blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix)
183187

184-
path_list = {}
188+
path_list: Dict[str, List[str]] = {}
185189

186190
for blob in blob_list:
187-
file_path = blob.name.split("/")
188-
file_name = file_path.pop()
189-
190-
file_path2 = "/".join(file_path)
191+
directory, file_name = os.path.split(blob.name)
191192

192-
if file_path2 in path_list:
193-
path_list[file_path2] += f"{file_name},"
193+
if directory in path_list:
194+
path_list[directory].append(file_name)
194195
else:
195-
path_list[file_path2] = f"{file_name},"
196-
197-
for key in path_list:
198-
a = path_list[key].split(",")
199-
a.pop()
200-
print(f"{key}")
201-
togo = 4
202-
for idx, val in enumerate(a):
203-
if idx == len(a) - 1:
204-
if len(a) > 4:
196+
path_list[directory] = [file_name]
197+
198+
for directory, files in path_list.items():
199+
print(f"{directory}")
200+
dir_size = len(files)
201+
for idx, file_name in enumerate(files):
202+
if idx == dir_size - 1:
203+
if dir_size > FILES_TO_DISPLAY:
205204
print("│ ....")
206-
print(f"{display_filename_prefix_last}{val}\n")
207-
elif len(a) > 4 and togo != -1:
208-
togo -= 1
209-
print(f"{display_filename_prefix_middle}{val}")
210-
elif len(a) <= 4:
211-
print(f"{display_filename_prefix_middle}{val}")
205+
print(f"{FILENAME_TREE_LAST}{file_name}\n")
206+
elif idx <= FILES_TO_DISPLAY:
207+
print(f"{FILENAME_TREE_MIDDLE}{file_name}")
212208

213209

214210
@dataclasses.dataclass
@@ -268,8 +264,8 @@ def from_document_path(
268264
A document from local document_path.
269265
"""
270266

271-
with open(document_path, "r") as f:
272-
doc = documentai.Document.from_json(f.read())
267+
with open(document_path, "r", encoding="utf-8") as f:
268+
doc = documentai.Document.from_json(f.read(), ignore_unknown_fields=True)
273269

274270
return cls(shards=[doc])
275271

0 commit comments

Comments
 (0)