Skip to content

Commit f1fa28a

Browse files
authored
chore: minor refactoring of GCS Functions in document wrapper (#39)
* chore: minor refactoring of GCS Functions in document wrapper - Simplified `print_gcs_document_tree` for readability/maintainability (And to resolve linter errors) - Added constants for reused values - Added `ignore_unknown_values` to `Document.from_json()` to avoid exceptions with new Document Proto versions between client library updates * chore: minor refactoring of GCS Functions in document wrapper - Simplified `print_gcs_document_tree` for readability/maintainability (And to resolve linter errors) - Added constants for reused values - Added `ignore_unknown_values` to `Document.from_json()` to avoid exceptions with new Document Proto versions between client library updates * chore: Fix to allow tests to pass
1 parent eeb1f98 commit f1fa28a

File tree

2 files changed

+35
-34
lines changed

2 files changed

+35
-34
lines changed

packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/constants.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,8 @@
1515
#
1616

1717
USER_AGENT_PRODUCT = "documentai-toolbox"
18+
19+
JSON_EXTENSION = ".json"
20+
JSON_MIMETYPE = "application/json"
21+
22+
FILE_CHECK_REGEX = r"(.*[.].*$)"

packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/document.py

Lines changed: 30 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@
1616
"""Wrappers for Document AI Document type."""
1717

1818
import dataclasses
19+
import os
1920
import re
20-
from typing import List, Optional
21+
from typing import Dict, List, Optional
2122

2223
from google.api_core import client_info
2324
from google.cloud import documentai
@@ -111,9 +112,11 @@ def _get_bytes(gcs_bucket_name: str, gcs_prefix: str) -> List[bytes]:
111112
blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix)
112113

113114
for blob in blob_list:
114-
if blob.name.endswith(".json"):
115-
blob_as_bytes = blob.download_as_bytes()
116-
result.append(blob_as_bytes)
115+
if (
116+
blob.name.endswith(constants.JSON_EXTENSION)
117+
or blob.content_type == constants.JSON_MIMETYPE
118+
):
119+
result.append(blob.download_as_bytes())
117120

118121
return result
119122

@@ -139,15 +142,15 @@ def _get_shards(gcs_bucket_name: str, gcs_prefix: str) -> List[documentai.Docume
139142
"""
140143
shards = []
141144

142-
file_check = re.match(r"(.*[.].*$)", gcs_prefix)
145+
file_check = re.match(constants.FILE_CHECK_REGEX, gcs_prefix)
143146

144147
if file_check is not None:
145148
raise ValueError("gcs_prefix cannot contain file types")
146149

147150
byte_array = _get_bytes(gcs_bucket_name, gcs_prefix)
148151

149152
for byte in byte_array:
150-
shards.append(documentai.Document.from_json(byte))
153+
shards.append(documentai.Document.from_json(byte, ignore_unknown_fields=True))
151154

152155
return shards
153156

@@ -170,45 +173,38 @@ def print_gcs_document_tree(gcs_bucket_name: str, gcs_prefix: str) -> None:
170173
None.
171174
172175
"""
173-
display_filename_prefix_middle = "├──"
174-
display_filename_prefix_last = "└──"
176+
FILENAME_TREE_MIDDLE = "├──"
177+
FILENAME_TREE_LAST = "└──"
178+
FILES_TO_DISPLAY = 4
175179

176-
file_check = re.match(r"(.*[.].*$)", gcs_prefix)
180+
file_check = re.match(constants.FILE_CHECK_REGEX, gcs_prefix)
177181

178182
if file_check is not None:
179183
raise ValueError("gcs_prefix cannot contain file types")
180184

181185
storage_client = _get_storage_client()
182186
blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix)
183187

184-
path_list = {}
188+
path_list: Dict[str, List[str]] = {}
185189

186190
for blob in blob_list:
187-
file_path = blob.name.split("/")
188-
file_name = file_path.pop()
189-
190-
file_path2 = "/".join(file_path)
191+
directory, file_name = os.path.split(blob.name)
191192

192-
if file_path2 in path_list:
193-
path_list[file_path2] += f"{file_name},"
193+
if directory in path_list:
194+
path_list[directory].append(file_name)
194195
else:
195-
path_list[file_path2] = f"{file_name},"
196-
197-
for key in path_list:
198-
a = path_list[key].split(",")
199-
a.pop()
200-
print(f"{key}")
201-
togo = 4
202-
for idx, val in enumerate(a):
203-
if idx == len(a) - 1:
204-
if len(a) > 4:
196+
path_list[directory] = [file_name]
197+
198+
for directory, files in path_list.items():
199+
print(f"{directory}")
200+
dir_size = len(files)
201+
for idx, file_name in enumerate(files):
202+
if idx == dir_size - 1:
203+
if dir_size > FILES_TO_DISPLAY:
205204
print("│ ....")
206-
print(f"{display_filename_prefix_last}{val}\n")
207-
elif len(a) > 4 and togo != -1:
208-
togo -= 1
209-
print(f"{display_filename_prefix_middle}{val}")
210-
elif len(a) <= 4:
211-
print(f"{display_filename_prefix_middle}{val}")
205+
print(f"{FILENAME_TREE_LAST}{file_name}\n")
206+
elif idx <= FILES_TO_DISPLAY:
207+
print(f"{FILENAME_TREE_MIDDLE}{file_name}")
212208

213209

214210
@dataclasses.dataclass
@@ -268,8 +264,8 @@ def from_document_path(
268264
A document from local document_path.
269265
"""
270266

271-
with open(document_path, "r") as f:
272-
doc = documentai.Document.from_json(f.read())
267+
with open(document_path, "r", encoding="utf-8") as f:
268+
doc = documentai.Document.from_json(f.read(), ignore_unknown_fields=True)
273269

274270
return cls(shards=[doc])
275271

0 commit comments

Comments
 (0)