Skip to content

Commit f2c66fd

Browse files
asikowitzhsheth2pedro93
authored
feat(ingest): Produce browse paths v2 on demand and with platform instance (#8173)
Co-authored-by: Harshal Sheth <[email protected]> Co-authored-by: Pedro Silva <[email protected]>
1 parent 43d37ff commit f2c66fd

File tree

6 files changed

+394
-139
lines changed

6 files changed

+394
-139
lines changed

metadata-ingestion/src/datahub/ingestion/api/source.py

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@
2222
from pydantic import BaseModel
2323

2424
from datahub.configuration.common import ConfigModel
25-
from datahub.emitter.mcp_builder import mcps_from_mce
25+
from datahub.configuration.source_common import PlatformInstanceConfigMixin
26+
from datahub.emitter.mcp_builder import PlatformKey, mcps_from_mce
2627
from datahub.ingestion.api.closeable import Closeable
2728
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
2829
from datahub.ingestion.api.report import Report
@@ -187,26 +188,15 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
187188
self.ctx.pipeline_config
188189
and self.ctx.pipeline_config.flags.generate_browse_path_v2
189190
):
190-
platform = getattr(self, "platform", None) or getattr(
191-
self.get_config(), "platform", None
192-
)
193-
env = getattr(self.get_config(), "env", None)
194-
browse_path_drop_dirs = [
195-
platform,
196-
platform and platform.lower(),
197-
env,
198-
env and env.lower(),
199-
]
200-
browse_path_processor = partial(
201-
auto_browse_path_v2,
202-
[s for s in browse_path_drop_dirs if s is not None],
191+
browse_path_processor = self._get_browse_path_processor(
192+
self.ctx.pipeline_config.flags.generate_browse_path_v2_dry_run
203193
)
204194

205195
return [
206196
auto_status_aspect,
207197
auto_materialize_referenced_tags,
208-
partial(auto_workunit_reporter, self.get_report()),
209198
browse_path_processor,
199+
partial(auto_workunit_reporter, self.get_report()),
210200
]
211201

212202
@staticmethod
@@ -248,6 +238,34 @@ def get_report(self) -> SourceReport:
248238
def close(self) -> None:
249239
pass
250240

241+
def _get_browse_path_processor(self, dry_run: bool) -> MetadataWorkUnitProcessor:
242+
config = self.get_config()
243+
platform = getattr(self, "platform", None) or getattr(config, "platform", None)
244+
env = getattr(config, "env", None)
245+
browse_path_drop_dirs = [
246+
platform,
247+
platform and platform.lower(),
248+
env,
249+
env and env.lower(),
250+
]
251+
252+
platform_key: Optional[PlatformKey] = None
253+
if (
254+
platform
255+
and isinstance(config, PlatformInstanceConfigMixin)
256+
and config.platform_instance
257+
):
258+
platform_key = PlatformKey(
259+
platform=platform, instance=config.platform_instance
260+
)
261+
262+
return partial(
263+
auto_browse_path_v2,
264+
platform_key=platform_key,
265+
drop_dirs=[s for s in browse_path_drop_dirs if s is not None],
266+
dry_run=dry_run,
267+
)
268+
251269

252270
class TestableSource(Source):
253271
@staticmethod

metadata-ingestion/src/datahub/ingestion/api/source_helpers.py

Lines changed: 129 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import logging
2-
from collections import defaultdict
32
from typing import (
43
TYPE_CHECKING,
54
Callable,
@@ -9,11 +8,14 @@
98
Optional,
109
Sequence,
1110
Set,
11+
Tuple,
1212
TypeVar,
1313
Union,
1414
)
1515

16+
from datahub.emitter.mce_builder import make_dataplatform_instance_urn
1617
from datahub.emitter.mcp import MetadataChangeProposalWrapper
18+
from datahub.emitter.mcp_builder import PlatformKey
1719
from datahub.ingestion.api.workunit import MetadataWorkUnit
1820
from datahub.metadata.schema_classes import (
1921
BrowsePathEntryClass,
@@ -25,6 +27,7 @@
2527
StatusClass,
2628
TagKeyClass,
2729
)
30+
from datahub.telemetry import telemetry
2831
from datahub.utilities.urns.tag_urn import TagUrn
2932
from datahub.utilities.urns.urn import guess_entity_type
3033
from datahub.utilities.urns.urn_iter import list_urns
@@ -166,68 +169,136 @@ def auto_materialize_referenced_tags(
166169

167170

168171
def auto_browse_path_v2(
169-
drop_dirs: Sequence[str],
170172
stream: Iterable[MetadataWorkUnit],
173+
*,
174+
dry_run: bool = False,
175+
drop_dirs: Sequence[str] = (),
176+
platform_key: Optional[PlatformKey] = None,
171177
) -> Iterable[MetadataWorkUnit]:
172-
"""Generate BrowsePathsV2 from Container and BrowsePaths aspects."""
178+
"""Generate BrowsePathsV2 from Container and BrowsePaths aspects.
173179
174-
ignore_urns: Set[str] = set()
175-
legacy_browse_paths: Dict[str, List[str]] = defaultdict(list)
176-
container_urns: Set[str] = set()
177-
parent_container_map: Dict[str, str] = {}
178-
children: Dict[str, List[str]] = defaultdict(list)
179-
for wu in stream:
180-
yield wu
180+
Generates browse paths v2 on demand, rather than waiting for end of ingestion,
181+
for better UI experience while ingestion is running.
181182
182-
urn = wu.get_urn()
183-
if guess_entity_type(urn) == "container":
184-
container_urns.add(urn)
185-
186-
container_aspects = wu.get_aspects_of_type(ContainerClass)
187-
for c_aspect in container_aspects:
188-
parent = c_aspect.container
189-
parent_container_map[urn] = parent
190-
children[parent].append(urn)
191-
192-
browse_path_aspects = wu.get_aspects_of_type(BrowsePathsClass)
193-
for b_aspect in browse_path_aspects:
194-
if b_aspect.paths:
195-
path = b_aspect.paths[0] # Only take first path
196-
legacy_browse_paths[urn] = [
197-
p for p in path.strip("/").split("/") if p.strip() not in drop_dirs
183+
To do this, assumes entities in container structure arrive in topological order
184+
and that all relevant aspects (Container, BrowsePaths, BrowsePathsV2) for an urn
185+
arrive together in a batch.
186+
187+
Calculates the correct BrowsePathsV2 at end of workunit stream,
188+
and emits "corrections", i.e. a final BrowsePathsV2 for any urns that have changed.
189+
"""
190+
191+
# For telemetry, to see if our sources violate assumptions
192+
num_out_of_order = 0
193+
num_out_of_batch = 0
194+
195+
# Set for all containers and urns with a Container aspect
196+
# Used to construct container paths while iterating through stream
197+
# Assumes topological order of entities in stream
198+
paths: Dict[str, List[BrowsePathEntryClass]] = {}
199+
200+
emitted_urns: Set[str] = set()
201+
containers_used_as_parent: Set[str] = set()
202+
for urn, batch in _batch_workunits_by_urn(stream):
203+
container_path: Optional[List[BrowsePathEntryClass]] = None
204+
legacy_path: Optional[List[BrowsePathEntryClass]] = None
205+
has_browse_path_v2 = False
206+
207+
for wu in batch:
208+
yield wu
209+
if not wu.is_primary_source:
210+
continue
211+
212+
container_aspect = wu.get_aspect_of_type(ContainerClass)
213+
if container_aspect:
214+
parent_urn = container_aspect.container
215+
containers_used_as_parent.add(parent_urn)
216+
paths[urn] = [
217+
*paths.setdefault(parent_urn, []), # Guess parent has no parents
218+
BrowsePathEntryClass(id=parent_urn, urn=parent_urn),
198219
]
220+
container_path = paths[urn]
221+
222+
if urn in containers_used_as_parent:
223+
# Topological order invariant violated; we've used the previous value of paths[urn]
224+
# TODO: Add sentry alert
225+
num_out_of_order += 1
226+
227+
browse_path_aspect = wu.get_aspect_of_type(BrowsePathsClass)
228+
if browse_path_aspect and browse_path_aspect.paths:
229+
legacy_path = [
230+
BrowsePathEntryClass(id=p.strip())
231+
for p in browse_path_aspect.paths[0].strip("/").split("/")
232+
if p.strip() and p.strip() not in drop_dirs
233+
]
234+
235+
if wu.get_aspect_of_type(BrowsePathsV2Class):
236+
has_browse_path_v2 = True
237+
238+
path = container_path or legacy_path
239+
if (path is not None or has_browse_path_v2) and urn in emitted_urns:
240+
# Batch invariant violated
241+
# TODO: Add sentry alert
242+
num_out_of_batch += 1
243+
elif has_browse_path_v2:
244+
emitted_urns.add(urn)
245+
elif path is not None:
246+
emitted_urns.add(urn)
247+
if not dry_run:
248+
yield MetadataChangeProposalWrapper(
249+
entityUrn=urn,
250+
aspect=BrowsePathsV2Class(
251+
path=_prepend_platform_instance(path, platform_key)
252+
),
253+
).as_workunit()
254+
elif urn not in emitted_urns and guess_entity_type(urn) == "container":
255+
# Root containers have no Container aspect, so they are not handled above
256+
emitted_urns.add(urn)
257+
if not dry_run:
258+
yield MetadataChangeProposalWrapper(
259+
entityUrn=urn,
260+
aspect=BrowsePathsV2Class(
261+
path=_prepend_platform_instance([], platform_key)
262+
),
263+
).as_workunit()
264+
265+
if num_out_of_batch or num_out_of_order:
266+
properties = {
267+
"platform": platform_key.platform if platform_key else None,
268+
"has_platform_instance": bool(platform_key.instance)
269+
if platform_key
270+
else False,
271+
"num_out_of_batch": num_out_of_batch,
272+
"num_out_of_order": num_out_of_order,
273+
}
274+
telemetry.telemetry_instance.ping("incorrect_browse_path_v2", properties)
275+
276+
277+
def _batch_workunits_by_urn(
278+
stream: Iterable[MetadataWorkUnit],
279+
) -> Iterable[Tuple[str, List[MetadataWorkUnit]]]:
280+
batch: List[MetadataWorkUnit] = []
281+
batch_urn: Optional[str] = None
282+
for wu in stream:
283+
if wu.get_urn() != batch_urn:
284+
if batch_urn is not None:
285+
yield batch_urn, batch
286+
batch = []
199287

200-
if wu.get_aspects_of_type(BrowsePathsV2Class):
201-
ignore_urns.add(urn)
288+
batch.append(wu)
289+
batch_urn = wu.get_urn()
202290

203-
paths: Dict[str, List[str]] = {} # Maps urn -> list of urns in path
204-
# Yield browse paths v2 in topological order, starting with root containers
205-
processed_urns = set()
206-
nodes = container_urns - parent_container_map.keys()
207-
while nodes:
208-
node = nodes.pop()
209-
nodes.update(children[node])
291+
if batch_urn is not None:
292+
yield batch_urn, batch
210293

211-
if node not in parent_container_map: # root
212-
paths[node] = []
213-
else:
214-
parent = parent_container_map[node]
215-
paths[node] = [*paths[parent], parent]
216-
if node not in ignore_urns:
217-
yield MetadataChangeProposalWrapper(
218-
entityUrn=node,
219-
aspect=BrowsePathsV2Class(
220-
path=[BrowsePathEntryClass(id=urn, urn=urn) for urn in paths[node]]
221-
),
222-
).as_workunit()
223-
processed_urns.add(node)
224-
225-
# Yield browse paths v2 based on browse paths v1 (legacy)
226-
# Only done if the entity is not part of a container hierarchy
227-
for urn in legacy_browse_paths.keys() - processed_urns - ignore_urns:
228-
yield MetadataChangeProposalWrapper(
229-
entityUrn=urn,
230-
aspect=BrowsePathsV2Class(
231-
path=[BrowsePathEntryClass(id=p) for p in legacy_browse_paths[urn]]
232-
),
233-
).as_workunit()
294+
295+
def _prepend_platform_instance(
296+
entries: List[BrowsePathEntryClass], platform_key: Optional[PlatformKey]
297+
) -> List[BrowsePathEntryClass]:
298+
if platform_key and platform_key.instance:
299+
urn = make_dataplatform_instance_urn(
300+
platform_key.platform, platform_key.instance
301+
)
302+
return [BrowsePathEntryClass(id=urn, urn=urn)] + entries
303+
304+
return entries

metadata-ingestion/src/datahub/ingestion/api/workunit.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1+
import logging
12
from dataclasses import dataclass
2-
from typing import Iterable, List, Optional, Type, TypeVar, Union, overload
3+
from typing import Iterable, Optional, Type, TypeVar, Union, overload
34

45
from deprecated import deprecated
56

@@ -11,6 +12,8 @@
1112
)
1213
from datahub.metadata.schema_classes import UsageAggregationClass, _Aspect
1314

15+
logger = logging.getLogger(__name__)
16+
1417
T_Aspect = TypeVar("T_Aspect", bound=_Aspect)
1518

1619

@@ -90,7 +93,7 @@ def get_urn(self) -> str:
9093
assert self.metadata.entityUrn
9194
return self.metadata.entityUrn
9295

93-
def get_aspects_of_type(self, aspect_cls: Type[T_Aspect]) -> List[T_Aspect]:
96+
def get_aspect_of_type(self, aspect_cls: Type[T_Aspect]) -> Optional[T_Aspect]:
9497
aspects: list
9598
if isinstance(self.metadata, MetadataChangeEvent):
9699
aspects = self.metadata.proposedSnapshot.aspects
@@ -109,7 +112,10 @@ def get_aspects_of_type(self, aspect_cls: Type[T_Aspect]) -> List[T_Aspect]:
109112
else:
110113
raise ValueError(f"Unexpected type {type(self.metadata)}")
111114

112-
return [a for a in aspects if isinstance(a, aspect_cls)]
115+
aspects = [a for a in aspects if isinstance(a, aspect_cls)]
116+
if len(aspects) > 1:
117+
logger.warning(f"Found multiple aspects of type {aspect_cls} in MCE {self}")
118+
return aspects[-1] if aspects else None
113119

114120
def decompose_mce_into_mcps(self) -> Iterable["MetadataWorkUnit"]:
115121
from datahub.emitter.mcp_builder import mcps_from_mce

metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,18 @@ class FlagsConfig(ConfigModel):
4545
"""
4646

4747
generate_browse_path_v2: bool = Field(
48-
default=False,
48+
default=True,
4949
description="Generate BrowsePathsV2 aspects from container hierarchy and existing BrowsePaths aspects.",
5050
)
5151

52+
generate_browse_path_v2_dry_run: bool = Field(
53+
default=True,
54+
description=(
55+
"Run through browse paths v2 generation but do not actually write the aspects to DataHub. "
56+
"Requires `generate_browse_path_v2` to also be enabled."
57+
),
58+
)
59+
5260

5361
class PipelineConfig(ConfigModel):
5462
# Once support for discriminated unions gets merged into Pydantic, we can
@@ -58,7 +66,7 @@ class PipelineConfig(ConfigModel):
5866
source: SourceConfig
5967
sink: DynamicTypedConfig
6068
transformers: Optional[List[DynamicTypedConfig]]
61-
flags: FlagsConfig = Field(default=FlagsConfig())
69+
flags: FlagsConfig = Field(default=FlagsConfig(), hidden_from_docs=True)
6270
reporting: List[ReporterConfig] = []
6371
run_id: str = DEFAULT_RUN_ID
6472
datahub_api: Optional[DatahubClientConfig] = None

0 commit comments

Comments
 (0)