Skip to content

Commit 2874fc5

Browse files
authored
feat: Validating logged features via Python SDK (#2640)
* Simple logged feature validation
* Validate with metadata
* Fix typos
* Revert entity columns

Signed-off-by: Oleksii Moskalenko <[email protected]>
1 parent af57a89 commit 2874fc5

File tree

8 files changed

+312
-62
lines changed

8 files changed

+312
-62
lines changed

sdk/python/feast/dqm/profilers/ge_profiler.py

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from feast.protos.feast.core.ValidationProfile_pb2 import (
2222
GEValidationProfiler as GEValidationProfilerProto,
2323
)
24+
from feast.protos.feast.serving.ServingService_pb2 import FieldStatus
2425

2526

2627
def _prepare_dataset(dataset: PandasDataset) -> PandasDataset:
@@ -41,6 +42,23 @@ def _prepare_dataset(dataset: PandasDataset) -> PandasDataset:
4142
return dataset_copy
4243

4344

45+
def _add_feature_metadata(dataset: PandasDataset) -> PandasDataset:
    """Append per-feature metadata columns (status and event timestamp) in place.

    A column is treated as a feature column when its name contains the "__"
    separator (e.g. "<feature_view>__<feature>"). For each such column this adds:

    * ``<column>__timestamp`` — copied from the dataset-level "event_timestamp"
      column, when that column exists;
    * ``<column>__status`` — ``FieldStatus.PRESENT``, downgraded to
      ``FieldStatus.NOT_FOUND`` on rows where the feature value is NaN.

    Returns the same (mutated) dataset.
    """
    # Loop-invariant: whether the event timestamp column exists does not change
    # while we iterate, so evaluate it once instead of per feature column.
    has_event_timestamp = "event_timestamp" in dataset.columns

    # Snapshot the column list up front so the metadata columns we add below
    # (which themselves contain "__") are never visited by this loop.
    for column in list(dataset.columns):
        if "__" not in column:
            # not a feature column
            continue

        if has_event_timestamp:
            dataset[f"{column}__timestamp"] = dataset["event_timestamp"]

        dataset[f"{column}__status"] = FieldStatus.PRESENT
        dataset[f"{column}__status"] = dataset[f"{column}__status"].mask(
            dataset[column].isna(), FieldStatus.NOT_FOUND
        )

    return dataset
60+
61+
4462
class GEProfile(Profile):
4563
"""
4664
GEProfile is an implementation of abstract Profile for integration with Great Expectations.
@@ -96,9 +114,12 @@ class GEProfiler(Profiler):
96114
"""
97115

98116
def __init__(
99-
self, user_defined_profiler: Callable[[pd.DataFrame], ExpectationSuite]
117+
self,
118+
user_defined_profiler: Callable[[pd.DataFrame], ExpectationSuite],
119+
with_feature_metadata: bool = False,
100120
):
101121
self.user_defined_profiler = user_defined_profiler
122+
self.with_feature_metadata = with_feature_metadata
102123

103124
def analyze_dataset(self, df: pd.DataFrame) -> Profile:
104125
"""
@@ -113,6 +134,9 @@ def analyze_dataset(self, df: pd.DataFrame) -> Profile:
113134

114135
dataset = _prepare_dataset(dataset)
115136

137+
if self.with_feature_metadata:
138+
dataset = _add_feature_metadata(dataset)
139+
116140
return GEProfile(expectation_suite=self.user_defined_profiler(dataset))
117141

118142
def to_proto(self):
@@ -158,5 +182,13 @@ def __repr__(self):
158182
return json.dumps(failed_expectations, indent=2)
159183

160184

161-
def ge_profiler(func):
162-
return GEProfiler(user_defined_profiler=func)
185+
def ge_profiler(*args, with_feature_metadata=False):
    """Decorator turning a user-defined profiler function into a :class:`GEProfiler`.

    Supports both the bare form (``@ge_profiler``) and the parametrized form
    (``@ge_profiler(with_feature_metadata=True)``).
    """

    def decorate(user_func):
        return GEProfiler(
            user_defined_profiler=user_func,
            with_feature_metadata=with_feature_metadata,
        )

    # Bare usage passes the decorated function positionally — apply immediately;
    # parametrized usage returns the decorator to be applied next.
    return decorate(args[0]) if args else decorate

sdk/python/feast/feature_logging.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
FeatureViewNotFoundException,
1212
OnDemandFeatureViewNotFoundException,
1313
)
14+
from feast.feature_view import DUMMY_ENTITY_NAME
1415
from feast.protos.feast.core.FeatureService_pb2 import (
1516
LoggingConfig as LoggingConfigProto,
1617
)
@@ -77,7 +78,11 @@ def get_schema(self, registry: "Registry") -> pa.Schema:
7778

7879
else:
7980
for entity_name in feature_view.entities:
81+
if entity_name == DUMMY_ENTITY_NAME:
82+
continue
83+
8084
entity = registry.get_entity(entity_name, self._project)
85+
8186
join_key = projection.join_key_map.get(
8287
entity.join_key, entity.join_key
8388
)

sdk/python/feast/feature_store.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
from feast.data_source import DataSource
4646
from feast.diff.infra_diff import InfraDiff, diff_infra_protos
4747
from feast.diff.registry_diff import RegistryDiff, apply_diff_to_registry, diff_between
48+
from feast.dqm.errors import ValidationFailed
4849
from feast.entity import Entity
4950
from feast.errors import (
5051
EntityNotFoundException,
@@ -83,7 +84,7 @@
8384
from feast.repo_config import RepoConfig, load_repo_config
8485
from feast.repo_contents import RepoContents
8586
from feast.request_feature_view import RequestFeatureView
86-
from feast.saved_dataset import SavedDataset, SavedDatasetStorage
87+
from feast.saved_dataset import SavedDataset, SavedDatasetStorage, ValidationReference
8788
from feast.type_map import (
8889
feast_value_type_to_python_type,
8990
python_values_to_proto_values,
@@ -2054,6 +2055,58 @@ def write_logged_features(
20542055
registry=self._registry,
20552056
)
20562057

2058+
def validate_logged_features(
    self,
    source: FeatureService,
    start: datetime,
    end: datetime,
    reference: ValidationReference,
    throw_exception: bool = True,
) -> Optional[ValidationFailed]:
    """
    Load logged features from an offline store and validate them against provided validation reference.

    Args:
        source: Logs source object (currently only feature services are supported)
        start: lower bound for loading logged features
        end: upper bound for loading logged features
        reference: validation reference
        throw_exception: throw exception or return it as a result

    Returns:
        Throw or return (depends on parameter) ValidationFailed exception if validation was not successful
        or None if successful.
    """
    warnings.warn(
        "Logged features validation is an experimental feature. "
        "This API is unstable and it could and most probably will be changed in the future. "
        "We do not guarantee that future changes will maintain backward compatibility.",
        RuntimeWarning,
    )

    if not isinstance(source, FeatureService):
        raise ValueError("Only feature service is currently supported as a source")

    # Pull the raw feature logs for the requested time window from the
    # configured offline store; this is a lazy retrieval job.
    job = self._get_provider().retrieve_feature_service_logs(
        feature_service=source,
        start_date=start,
        end_date=end,
        config=self.config,
        registry=self.registry,
    )

    # Materializing with a validation reference runs the reference's
    # profiler-backed expectation suite; a failed suite surfaces as
    # ValidationFailed.
    try:
        job.to_arrow(validation_reference=reference)
    except ValidationFailed as exc:
        if throw_exception:
            raise

        return exc

    return None
2109+
20572110

20582111
def _validate_entity_values(join_key_values: Dict[str, List[Value]]):
20592112
set_of_row_lengths = {len(v) for v in join_key_values.values()}

sdk/python/feast/feature_view.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,7 @@ def __copy__(self):
285285
online=self.online,
286286
)
287287
fv.projection = copy.copy(self.projection)
288+
fv.entities = self.entities
288289
return fv
289290

290291
def __eq__(self, other):

sdk/python/feast/infra/offline_stores/file.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]:
7373
def _to_df_internal(self) -> pd.DataFrame:
    """Materialize the historical retrieval result as a pandas DataFrame.

    Evaluation is deferred until this call: the stored evaluation function
    builds the dask graph, ``compute()`` executes it, and the index is
    normalized to a fresh RangeIndex before returning.
    """
    computed = self.evaluation_function().compute()
    return computed.reset_index(drop=True)
7778

7879
@log_exceptions_and_usage
@@ -555,11 +556,18 @@ def _filter_ttl(
555556
# Filter rows by defined timestamp tolerance
556557
if feature_view.ttl and feature_view.ttl.total_seconds() != 0:
557558
df_to_join = df_to_join[
558-
(
559-
df_to_join[timestamp_field]
560-
>= df_to_join[entity_df_event_timestamp_col] - feature_view.ttl
559+
# do not drop entity rows if one of the sources returns NaNs
560+
df_to_join[timestamp_field].isna()
561+
| (
562+
(
563+
df_to_join[timestamp_field]
564+
>= df_to_join[entity_df_event_timestamp_col] - feature_view.ttl
565+
)
566+
& (
567+
df_to_join[timestamp_field]
568+
<= df_to_join[entity_df_event_timestamp_col]
569+
)
561570
)
562-
& (df_to_join[timestamp_field] <= df_to_join[entity_df_event_timestamp_col])
563571
]
564572

565573
df_to_join = df_to_join.persist()

sdk/python/tests/integration/e2e/test_validation.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,21 @@
1+
import datetime
2+
13
import pandas as pd
4+
import pyarrow as pa
25
import pytest
36
from great_expectations.core import ExpectationSuite
47
from great_expectations.dataset import PandasDataset
58

9+
from feast import FeatureService
610
from feast.dqm.errors import ValidationFailed
711
from feast.dqm.profilers.ge_profiler import ge_profiler
12+
from feast.feature_logging import (
13+
LOG_TIMESTAMP_FIELD,
14+
FeatureServiceLoggingSource,
15+
LoggingConfig,
16+
)
17+
from feast.protos.feast.serving.ServingService_pb2 import FieldStatus
18+
from feast.wait import wait_retry_backoff
819
from tests.integration.feature_repos.repo_configuration import (
920
construct_universal_feature_views,
1021
)
@@ -13,6 +24,7 @@
1324
driver,
1425
location,
1526
)
27+
from tests.utils.logged_features import prepare_logs
1628

1729
_features = [
1830
"customer_profile:current_balance",
@@ -32,6 +44,39 @@ def configurable_profiler(dataset: PandasDataset) -> ExpectationSuite:
3244

3345
return UserConfigurableProfiler(
3446
profile_dataset=dataset,
47+
ignored_columns=["event_timestamp"],
48+
excluded_expectations=[
49+
"expect_table_columns_to_match_ordered_list",
50+
"expect_table_row_count_to_be_between",
51+
],
52+
value_set_threshold="few",
53+
).build_suite()
54+
55+
56+
@ge_profiler(with_feature_metadata=True)
57+
def profiler_with_feature_metadata(dataset: PandasDataset) -> ExpectationSuite:
58+
from great_expectations.profile.user_configurable_profiler import (
59+
UserConfigurableProfiler,
60+
)
61+
62+
# always present
63+
dataset.expect_column_values_to_be_in_set(
64+
"global_stats__avg_ride_length__status", {FieldStatus.PRESENT}
65+
)
66+
67+
# present at least in 70% of rows
68+
dataset.expect_column_values_to_be_in_set(
69+
"customer_profile__current_balance__status", {FieldStatus.PRESENT}, mostly=0.7
70+
)
71+
72+
return UserConfigurableProfiler(
73+
profile_dataset=dataset,
74+
ignored_columns=["event_timestamp"]
75+
+ [
76+
c
77+
for c in dataset.columns
78+
if c.endswith("__timestamp") or c.endswith("__status")
79+
],
3580
excluded_expectations=[
3681
"expect_table_columns_to_match_ordered_list",
3782
"expect_table_row_count_to_be_between",
@@ -127,3 +172,88 @@ def test_historical_retrieval_fails_on_validation(environment, universal_data_so
127172

128173
assert failed_expectations[1].check_name == "expect_column_values_to_be_in_set"
129174
assert failed_expectations[1].column_name == "avg_passenger_count"
175+
176+
177+
@pytest.mark.integration
def test_logged_features_validation(environment, universal_data_sources):
    """End-to-end check: log features, then validate the logs against a
    saved-dataset reference using a profiler that inspects feature metadata."""
    store = environment.feature_store

    (_, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)
    feature_service = FeatureService(
        name="test_service",
        features=[
            feature_views.customer[
                ["current_balance", "avg_passenger_count", "lifetime_trip_count"]
            ],
            feature_views.order[["order_is_success"]],
            feature_views.global_fv[["num_rides", "avg_ride_length"]],
        ],
        logging_config=LoggingConfig(
            destination=environment.data_source_creator.create_logged_features_destination()
        ),
    )

    store.apply(
        [driver(), customer(), location(), feature_service, *feature_views.values()]
    )

    entity_df = datasets.entity_df.drop(
        columns=["order_id", "origin_id", "destination_id"]
    )

    # Add some non-existing entities to check NOT_FOUND feature handling.
    # NOTE: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0,
    # and appending row-by-row is quadratic — build the synthetic rows once
    # and concatenate them in a single call instead.
    missing_entities = pd.DataFrame(
        {
            "customer_id": [2000 + i for i in range(5)],
            "driver_id": [6000 + i for i in range(5)],
            "event_timestamp": [datetime.datetime.now() for _ in range(5)],
        }
    )
    entity_df = pd.concat([entity_df, missing_entities], ignore_index=True)

    # The reference dataset the logged features will be validated against.
    reference_dataset = store.create_saved_dataset(
        from_=store.get_historical_features(
            entity_df=entity_df, features=feature_service, full_feature_names=True
        ),
        name="reference_for_validating_logged_features",
        storage=environment.data_source_creator.create_saved_dataset_destination(),
    )

    # Simulate served features and write them to the logging destination.
    log_source_df = store.get_historical_features(
        entity_df=entity_df, features=feature_service, full_feature_names=False
    ).to_df()
    logs_df = prepare_logs(log_source_df, feature_service, store)

    schema = FeatureServiceLoggingSource(
        feature_service=feature_service, project=store.project
    ).get_schema(store._registry)
    store.write_logged_features(
        pa.Table.from_pandas(logs_df, schema=schema), source=feature_service
    )

    def validate():
        """
        Return Tuple[succeed, completed]
        Succeed will be True if no ValidateFailed exception was raised
        """
        try:
            store.validate_logged_features(
                feature_service,
                start=logs_df[LOG_TIMESTAMP_FIELD].min(),
                end=logs_df[LOG_TIMESTAMP_FIELD].max() + datetime.timedelta(seconds=1),
                reference=reference_dataset.as_reference(
                    profiler=profiler_with_feature_metadata
                ),
            )
        except ValidationFailed:
            return False, True
        except Exception:
            # Deliberate best-effort: the log table may still be being
            # created asynchronously, so report "not completed" and retry.
            return False, False

        return True, True

    success = wait_retry_backoff(validate, timeout_secs=30)
    assert success, "Validation failed (unexpectedly)"

0 commit comments

Comments
 (0)