|
1 | 1 | import logging
|
2 |
| -from collections import defaultdict |
3 | 2 | from typing import (
|
4 | 3 | TYPE_CHECKING,
|
5 | 4 | Callable,
|
|
9 | 8 | Optional,
|
10 | 9 | Sequence,
|
11 | 10 | Set,
|
| 11 | + Tuple, |
12 | 12 | TypeVar,
|
13 | 13 | Union,
|
14 | 14 | )
|
15 | 15 |
|
| 16 | +from datahub.emitter.mce_builder import make_dataplatform_instance_urn |
16 | 17 | from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
| 18 | +from datahub.emitter.mcp_builder import PlatformKey |
17 | 19 | from datahub.ingestion.api.workunit import MetadataWorkUnit
|
18 | 20 | from datahub.metadata.schema_classes import (
|
19 | 21 | BrowsePathEntryClass,
|
|
25 | 27 | StatusClass,
|
26 | 28 | TagKeyClass,
|
27 | 29 | )
|
| 30 | +from datahub.telemetry import telemetry |
28 | 31 | from datahub.utilities.urns.tag_urn import TagUrn
|
29 | 32 | from datahub.utilities.urns.urn import guess_entity_type
|
30 | 33 | from datahub.utilities.urns.urn_iter import list_urns
|
@@ -166,68 +169,136 @@ def auto_materialize_referenced_tags(
|
166 | 169 |
|
167 | 170 |
|
def auto_browse_path_v2(
    stream: Iterable[MetadataWorkUnit],
    *,
    dry_run: bool = False,
    drop_dirs: Sequence[str] = (),
    platform_key: Optional[PlatformKey] = None,
) -> Iterable[MetadataWorkUnit]:
    """Generate BrowsePathsV2 from Container and BrowsePaths aspects.

    Generates browse paths v2 on demand, rather than waiting for end of ingestion,
    for better UI experience while ingestion is running.

    To do this, assumes entities in container structure arrive in topological order
    and that all relevant aspects (Container, BrowsePaths, BrowsePathsV2) for an urn
    arrive together in a batch, i.e. as consecutive workunits.

    Violations of these assumptions are not corrected: they are only counted, and
    a single telemetry ping is sent once the stream is exhausted if any occurred.

    Args:
        stream: Workunits to pass through; every input workunit is yielded unchanged.
        dry_run: If true, no BrowsePathsV2 workunits are emitted; the assumption
            checks and the telemetry ping still run.
        drop_dirs: Path segments to leave out when converting a legacy BrowsePaths
            path into a BrowsePathsV2 path.
        platform_key: If it carries an instance, the platform instance urn is
            prepended to every generated path. Also used for telemetry properties.

    Yields:
        The input workunits, each batch followed by at most one generated
        BrowsePathsV2 workunit for the batch's urn.
    """

    # For telemetry, to see if our sources violate assumptions
    num_out_of_order = 0
    num_out_of_batch = 0

    # Maps urn -> container path, for every urn with a Container aspect
    # and every container referenced as a parent.
    # Used to construct container paths while iterating through stream
    # Assumes topological order of entities in stream
    paths: Dict[str, List[BrowsePathEntryClass]] = {}

    emitted_urns: Set[str] = set()
    containers_used_as_parent: Set[str] = set()
    for urn, batch in _batch_workunits_by_urn(stream):
        container_path: Optional[List[BrowsePathEntryClass]] = None
        legacy_path: Optional[List[BrowsePathEntryClass]] = None
        has_browse_path_v2 = False

        for wu in batch:
            yield wu
            if not wu.is_primary_source:
                continue

            container_aspect = wu.get_aspect_of_type(ContainerClass)
            if container_aspect:
                parent_urn = container_aspect.container
                containers_used_as_parent.add(parent_urn)
                paths[urn] = [
                    *paths.setdefault(parent_urn, []),  # Guess parent has no parents
                    BrowsePathEntryClass(id=parent_urn, urn=parent_urn),
                ]
                container_path = paths[urn]

                if urn in containers_used_as_parent:
                    # Topological order invariant violated; we've used the previous value of paths[urn]
                    # TODO: Add sentry alert
                    num_out_of_order += 1

            browse_path_aspect = wu.get_aspect_of_type(BrowsePathsClass)
            if browse_path_aspect and browse_path_aspect.paths:
                # Only the first legacy path is used; strip each segment once.
                segments = (
                    segment.strip()
                    for segment in browse_path_aspect.paths[0].strip("/").split("/")
                )
                legacy_path = [
                    BrowsePathEntryClass(id=segment)
                    for segment in segments
                    if segment and segment not in drop_dirs
                ]

            if wu.get_aspect_of_type(BrowsePathsV2Class):
                has_browse_path_v2 = True

        # Container hierarchy takes precedence over the legacy browse path
        path = container_path or legacy_path

        if (path is not None or has_browse_path_v2) and urn in emitted_urns:
            # Batch invariant violated
            # TODO: Add sentry alert
            num_out_of_batch += 1
            continue

        if has_browse_path_v2:
            # Source provided its own BrowsePathsV2; record it and generate nothing
            emitted_urns.add(urn)
            continue

        if path is None:
            # Root containers have no Container aspect, so they have no path yet
            if urn in emitted_urns or guess_entity_type(urn) != "container":
                continue
            path = []

        emitted_urns.add(urn)
        if not dry_run:
            yield MetadataChangeProposalWrapper(
                entityUrn=urn,
                aspect=BrowsePathsV2Class(
                    path=_prepend_platform_instance(path, platform_key)
                ),
            ).as_workunit()

    if num_out_of_batch or num_out_of_order:
        properties = {
            "platform": platform_key.platform if platform_key else None,
            "has_platform_instance": bool(platform_key and platform_key.instance),
            "num_out_of_batch": num_out_of_batch,
            "num_out_of_order": num_out_of_order,
        }
        telemetry.telemetry_instance.ping("incorrect_browse_path_v2", properties)
| 275 | + |
| 276 | + |
def _batch_workunits_by_urn(
    stream: Iterable[MetadataWorkUnit],
) -> Iterable[Tuple[str, List[MetadataWorkUnit]]]:
    """Group consecutive workunits that share an urn.

    Only adjacent workunits are grouped: if an urn reappears after a different
    urn, it starts a new batch. An empty stream yields nothing.

    Yields:
        (urn, workunits) pairs in stream order, each list non-empty.
    """
    batch: List[MetadataWorkUnit] = []
    batch_urn: Optional[str] = None
    for wu in stream:
        urn = wu.get_urn()  # Look the urn up once per workunit
        if urn != batch_urn:
            if batch_urn is not None:
                yield batch_urn, batch
            batch = []

        batch.append(wu)
        batch_urn = urn

    # Flush the final batch, if the stream was non-empty
    if batch_urn is not None:
        yield batch_urn, batch
210 | 293 |
|
211 |
| - if node not in parent_container_map: # root |
212 |
| - paths[node] = [] |
213 |
| - else: |
214 |
| - parent = parent_container_map[node] |
215 |
| - paths[node] = [*paths[parent], parent] |
216 |
| - if node not in ignore_urns: |
217 |
| - yield MetadataChangeProposalWrapper( |
218 |
| - entityUrn=node, |
219 |
| - aspect=BrowsePathsV2Class( |
220 |
| - path=[BrowsePathEntryClass(id=urn, urn=urn) for urn in paths[node]] |
221 |
| - ), |
222 |
| - ).as_workunit() |
223 |
| - processed_urns.add(node) |
224 |
| - |
225 |
| - # Yield browse paths v2 based on browse paths v1 (legacy) |
226 |
| - # Only done if the entity is not part of a container hierarchy |
227 |
| - for urn in legacy_browse_paths.keys() - processed_urns - ignore_urns: |
228 |
| - yield MetadataChangeProposalWrapper( |
229 |
| - entityUrn=urn, |
230 |
| - aspect=BrowsePathsV2Class( |
231 |
| - path=[BrowsePathEntryClass(id=p) for p in legacy_browse_paths[urn]] |
232 |
| - ), |
233 |
| - ).as_workunit() |
| 294 | + |
def _prepend_platform_instance(
    entries: List[BrowsePathEntryClass], platform_key: Optional[PlatformKey]
) -> List[BrowsePathEntryClass]:
    """Return a browse path with the platform instance entry in front, if any.

    If `platform_key` is set and has an instance, the result starts with an
    entry for the data platform instance urn, followed by `entries`.
    Otherwise the result contains just `entries`.

    A new list is returned in both cases, so the caller's list is never
    aliased by the result.
    """
    if platform_key and platform_key.instance:
        urn = make_dataplatform_instance_urn(
            platform_key.platform, platform_key.instance
        )
        return [BrowsePathEntryClass(id=urn, urn=urn), *entries]

    # Copy, so mutating the result cannot affect the list the caller keeps
    return list(entries)
0 commit comments