
Commit 6f79069

feat: Add an experimental lambda-based materialization engine (#2923)
* feat: Add an experimental lambda-based materialization engine
* setup and teardown lambda func
* actually get the test working correctly
* actually get the test working correctly
* parallelize with threads
* super call
* fix bugs
* fix tests
* fix tests
* undo unintended changes

Signed-off-by: Achal Shah <[email protected]>
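For orientation, a minimal sketch of how the new engine would be enabled in feature_store.yaml, using the fields defined by LambdaMaterializationEngineConfig in this commit; the project name, region, image URI, and role ARN are hypothetical placeholders:

    project: my_project
    provider: aws
    online_store:
      type: dynamodb
      region: us-west-2
    batch_engine:
      type: lambda
      materialization_image: <account>.dkr.ecr.us-west-2.amazonaws.com/feast-lambda-materializer:latest
      lambda_role: arn:aws:iam::<account>:role/feast-materialization-role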
1 parent 054446c commit 6f79069

12 files changed: +655 −17 lines


sdk/python/feast/infra/aws.py

Lines changed: 10 additions & 2 deletions
@@ -106,6 +106,15 @@ def update_infra(
         self._deploy_feature_server(project, image_uri)
 
+        if self.batch_engine:
+            self.batch_engine.update(
+                project,
+                tables_to_delete,
+                tables_to_keep,
+                entities_to_delete,
+                entities_to_keep,
+            )
+
     def _deploy_feature_server(self, project: str, image_uri: str):
         _logger.info("Deploying feature server...")
 
@@ -198,8 +207,7 @@ def _deploy_feature_server(self, project: str, image_uri: str):
     def teardown_infra(
         self, project: str, tables: Sequence[FeatureView], entities: Sequence[Entity],
     ) -> None:
-        if self.online_store:
-            self.online_store.teardown(self.repo_config, tables, entities)
+        super(AwsProvider, self).teardown_infra(project, tables, entities)
 
         if (
             self.repo_config.feature_server is not None

sdk/python/feast/infra/materialization/lambda/Dockerfile

Lines changed: 25 additions & 0 deletions

FROM public.ecr.aws/lambda/python:3.9

RUN yum install -y git

# Copy app handler code
COPY sdk/python/feast/infra/materialization/lambda/app.py ${LAMBDA_TASK_ROOT}

# Copy necessary parts of the Feast codebase
COPY sdk/python sdk/python
COPY protos protos
COPY go go
COPY setup.py setup.py
COPY pyproject.toml pyproject.toml
COPY README.md README.md

# Install Feast for AWS with Lambda dependencies.
# The bind mount is needed because setuptools_scm requires access to the
# .git directory to infer the version of Feast being installed:
# https://github.com/pypa/setuptools_scm#usage-from-docker
# This also assumes the Dockerfile is built from the root of the repository.
RUN --mount=source=.git,target=.git,type=bind pip3 install --no-cache-dir -e '.[aws,redis]'

# Set the CMD to the handler (could also be done as a parameter override outside of the Dockerfile)
CMD [ "app.handler" ]

sdk/python/feast/infra/materialization/lambda/__init__.py

Lines changed: 11 additions & 0 deletions

from .lambda_engine import (
    LambdaMaterializationEngine,
    LambdaMaterializationEngineConfig,
    LambdaMaterializationJob,
)

__all__ = [
    "LambdaMaterializationEngineConfig",
    "LambdaMaterializationJob",
    "LambdaMaterializationEngine",
]

sdk/python/feast/infra/materialization/lambda/app.py

Lines changed: 82 additions & 0 deletions

import base64
import json
import sys
import tempfile
import traceback
from pathlib import Path

import pyarrow.parquet as pq

from feast import FeatureStore
from feast.constants import FEATURE_STORE_YAML_ENV_NAME
from feast.infra.materialization.local_engine import DEFAULT_BATCH_SIZE
from feast.utils import _convert_arrow_to_proto, _run_pyarrow_field_mapping


def handler(event, context):
    """Materialize one parquet file into the online store.

    The event is expected to contain the following keys:

    - FEATURE_STORE_YAML_ENV_NAME: the repo's feature_store.yaml, base64-encoded
    - view_name: the name of the feature view to materialize
    - view_type: "batch" or "stream"
    - path: the s3:// URI of the parquet file to ingest
    """
    print("Received event: " + json.dumps(event, indent=2), flush=True)

    try:
        config_base64 = event[FEATURE_STORE_YAML_ENV_NAME]
        config_bytes = base64.b64decode(config_base64)

        # Create a new unique directory for writing feature_store.yaml
        repo_path = Path(tempfile.mkdtemp())

        with open(repo_path / "feature_store.yaml", "wb") as f:
            f.write(config_bytes)

        # Initialize the feature store
        store = FeatureStore(repo_path=str(repo_path.resolve()))

        view_name = event["view_name"]
        view_type = event["view_type"]
        path = event["path"]

        bucket = path[len("s3://") :].split("/", 1)[0]
        key = path[len("s3://") :].split("/", 1)[1]
        print(f"Inferred Bucket: `{bucket}` Key: `{key}`", flush=True)

        if view_type == "batch":
            # TODO: This probably needs to become `store.get_batch_feature_view` at some point.
            feature_view = store.get_feature_view(view_name)
        else:
            feature_view = store.get_stream_feature_view(view_name)

        print(f"Got Feature View: `{feature_view}`", flush=True)

        table = pq.read_table(path)
        if feature_view.batch_source.field_mapping is not None:
            table = _run_pyarrow_field_mapping(
                table, feature_view.batch_source.field_mapping
            )

        join_key_to_value_type = {
            entity.name: entity.dtype.to_value_type()
            for entity in feature_view.entity_columns
        }

        written_rows = 0

        for batch in table.to_batches(DEFAULT_BATCH_SIZE):
            rows_to_write = _convert_arrow_to_proto(
                batch, feature_view, join_key_to_value_type
            )
            store._provider.online_write_batch(
                store.config, feature_view, rows_to_write, lambda x: None,
            )
            written_rows += len(rows_to_write)
        return {"written_rows": written_rows}
    except Exception as e:
        print(f"Exception: {e}", flush=True)
        print("Traceback:", flush=True)
        print(traceback.format_exc(), flush=True)
        sys.exit(1)
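
A minimal local smoke test for this handler, mirroring the payload that LambdaMaterializationEngine builds below; the view name and S3 path are hypothetical, and running it requires AWS credentials plus a valid feature_store.yaml in the working directory:

    import base64

    from app import handler  # the module above
    from feast.constants import FEATURE_STORE_YAML_ENV_NAME

    # Base64-encode the repo config exactly as the engine does.
    with open("feature_store.yaml", "rb") as f:
        config_base64 = base64.b64encode(f.read()).decode("utf-8")

    event = {
        FEATURE_STORE_YAML_ENV_NAME: config_base64,
        "view_name": "driver_hourly_stats",  # hypothetical feature view
        "view_type": "batch",
        "path": "s3://my-bucket/materialization/part-0000.parquet",  # hypothetical
    }
    print(handler(event, context=None))  # e.g. {'written_rows': 1234}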

sdk/python/feast/infra/materialization/lambda/lambda_engine.py

Lines changed: 238 additions & 0 deletions

import base64
import json
import logging
from concurrent.futures import ThreadPoolExecutor, wait
from dataclasses import dataclass
from datetime import datetime
from typing import Callable, List, Literal, Optional, Sequence, Union

import boto3
from pydantic import StrictStr
from tqdm import tqdm

from feast.batch_feature_view import BatchFeatureView
from feast.constants import FEATURE_STORE_YAML_ENV_NAME
from feast.entity import Entity
from feast.feature_view import FeatureView
from feast.infra.materialization.batch_materialization_engine import (
    BatchMaterializationEngine,
    MaterializationJob,
    MaterializationJobStatus,
    MaterializationTask,
)
from feast.infra.offline_stores.offline_store import OfflineStore
from feast.infra.online_stores.online_store import OnlineStore
from feast.registry import BaseRegistry
from feast.repo_config import FeastConfigBaseModel, RepoConfig
from feast.stream_feature_view import StreamFeatureView
from feast.utils import _get_column_names
from feast.version import get_version

DEFAULT_BATCH_SIZE = 10_000

logger = logging.getLogger(__name__)


class LambdaMaterializationEngineConfig(FeastConfigBaseModel):
    """Batch Materialization Engine config for lambda based engine"""

    type: Literal["lambda"] = "lambda"
    """ Type selector"""

    materialization_image: StrictStr
    """ The URI of a container image in the Amazon ECR registry, which should be used for materialization. """

    lambda_role: StrictStr
    """ Role that should be used by the materialization lambda """


@dataclass
class LambdaMaterializationJob(MaterializationJob):
    def __init__(self, job_id: str, status: MaterializationJobStatus) -> None:
        super().__init__()
        self._job_id: str = job_id
        self._status = status
        self._error = None

    def status(self) -> MaterializationJobStatus:
        return self._status

    def error(self) -> Optional[BaseException]:
        return self._error

    def should_be_retried(self) -> bool:
        return False

    def job_id(self) -> str:
        return self._job_id

    def url(self) -> Optional[str]:
        return None


class LambdaMaterializationEngine(BatchMaterializationEngine):
    """
    WARNING: This engine should be considered "Alpha" functionality.
    """

    def update(
        self,
        project: str,
        views_to_delete: Sequence[
            Union[BatchFeatureView, StreamFeatureView, FeatureView]
        ],
        views_to_keep: Sequence[
            Union[BatchFeatureView, StreamFeatureView, FeatureView]
        ],
        entities_to_delete: Sequence[Entity],
        entities_to_keep: Sequence[Entity],
    ):
        # This should be setting up the lambda function.
        r = self.lambda_client.create_function(
            FunctionName=self.lambda_name,
            PackageType="Image",
            Role=self.repo_config.batch_engine.lambda_role,
            Code={"ImageUri": self.repo_config.batch_engine.materialization_image},
            Timeout=600,
            Tags={
                "feast-owned": "True",
                "project": project,
                "feast-sdk-version": get_version(),
            },
        )
        logger.info("Creating lambda function %s, %s", self.lambda_name, r)

        logger.info("Waiting for function %s to be active", self.lambda_name)
        waiter = self.lambda_client.get_waiter("function_active")
        waiter.wait(FunctionName=self.lambda_name)

    def teardown_infra(
        self,
        project: str,
        fvs: Sequence[Union[BatchFeatureView, StreamFeatureView, FeatureView]],
        entities: Sequence[Entity],
    ):
        # This should be tearing down the lambda function.
        logger.info("Tearing down lambda %s", self.lambda_name)
        r = self.lambda_client.delete_function(FunctionName=self.lambda_name)
        logger.info("Finished tearing down lambda %s: %s", self.lambda_name, r)

    def __init__(
        self,
        *,
        repo_config: RepoConfig,
        offline_store: OfflineStore,
        online_store: OnlineStore,
        **kwargs,
    ):
        super().__init__(
            repo_config=repo_config,
            offline_store=offline_store,
            online_store=online_store,
            **kwargs,
        )
        repo_path = self.repo_config.repo_path
        assert repo_path
        feature_store_path = repo_path / "feature_store.yaml"
        self.feature_store_base64 = str(
            base64.b64encode(bytes(feature_store_path.read_text(), "UTF-8")), "UTF-8"
        )

        self.lambda_name = f"feast-materialize-{self.repo_config.project}"
        if len(self.lambda_name) > 64:
            self.lambda_name = self.lambda_name[:64]
        self.lambda_client = boto3.client("lambda")

    def materialize(
        self, registry, tasks: List[MaterializationTask]
    ) -> List[MaterializationJob]:
        return [
            self._materialize_one(
                registry,
                task.feature_view,
                task.start_time,
                task.end_time,
                task.project,
                task.tqdm_builder,
            )
            for task in tasks
        ]

    def _materialize_one(
        self,
        registry: BaseRegistry,
        feature_view: Union[BatchFeatureView, StreamFeatureView, FeatureView],
        start_date: datetime,
        end_date: datetime,
        project: str,
        tqdm_builder: Callable[[int], tqdm],
    ):
        entities = []
        for entity_name in feature_view.entities:
            entities.append(registry.get_entity(entity_name, project))

        (
            join_key_columns,
            feature_name_columns,
            timestamp_field,
            created_timestamp_column,
        ) = _get_column_names(feature_view, entities)

        job_id = f"{feature_view.name}-{start_date}-{end_date}"

        offline_job = self.offline_store.pull_latest_from_table_or_query(
            config=self.repo_config,
            data_source=feature_view.batch_source,
            join_key_columns=join_key_columns,
            feature_name_columns=feature_name_columns,
            timestamp_field=timestamp_field,
            created_timestamp_column=created_timestamp_column,
            start_date=start_date,
            end_date=end_date,
        )

        paths = offline_job.to_remote_storage()
        max_workers = len(paths) if len(paths) <= 20 else 20
        executor = ThreadPoolExecutor(max_workers=max_workers)
        futures = []

        for path in paths:
            payload = {
                FEATURE_STORE_YAML_ENV_NAME: self.feature_store_base64,
                "view_name": feature_view.name,
                "view_type": "batch",
                "path": path,
            }
            # Invoke a lambda to materialize this file.
            logger.info("Invoking materialization for %s", path)
            futures.append(
                executor.submit(
                    self.lambda_client.invoke,
                    FunctionName=self.lambda_name,
                    InvocationType="RequestResponse",
                    Payload=json.dumps(payload),
                )
            )

        done, not_done = wait(futures)
        logger.info("Done: %s Not Done: %s", done, not_done)
        for f in done:
            response = f.result()
            output = json.loads(response["Payload"].read())

            logger.info(
                f"Ingested task; request id {response['ResponseMetadata']['RequestId']}, "
                f"rows written: {output['written_rows']}"
            )

        for f in not_done:
            response = f.result()
            logger.error(f"Ingestion failed: {response}")

        return LambdaMaterializationJob(
            job_id=job_id,
            status=MaterializationJobStatus.SUCCEEDED
            if not not_done
            else MaterializationJobStatus.ERROR,
        )
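
For context, a minimal sketch of how this engine is driven end to end, assuming feature_store.yaml sets batch_engine to type "lambda" as sketched earlier: a plain materialize() call on the store, which turns each remote parquet file produced by the offline store into one Lambda invocation.

    from datetime import datetime, timedelta

    from feast import FeatureStore

    store = FeatureStore(repo_path=".")  # repo with the lambda batch engine configured
    # Each file from offline_job.to_remote_storage() becomes one
    # RequestResponse Lambda invocation, run on up to 20 threads.
    store.materialize(
        start_date=datetime.utcnow() - timedelta(days=1),
        end_date=datetime.utcnow(),
    )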

sdk/python/feast/infra/online_stores/dynamodb.py

Lines changed: 2 additions & 1 deletion
@@ -229,7 +229,8 @@ def online_read(
                     break
                 batch_entity_ids = {
                     table_instance.name: {
-                        "Keys": [{"entity_id": entity_id} for entity_id in batch]
+                        "Keys": [{"entity_id": entity_id} for entity_id in batch],
+                        "ConsistentRead": True,
                     }
                 }
                 with tracing_span(name="remote_call"):
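
Setting "ConsistentRead": True makes DynamoDB use strongly consistent reads for the batch, so rows written by a materialization that just completed are guaranteed to be visible to the next read. A minimal sketch of the resulting request shape, with a hypothetical table name and keys:

    import boto3

    dynamodb = boto3.resource("dynamodb")
    response = dynamodb.batch_get_item(
        RequestItems={
            "feast.driver_hourly_stats": {  # hypothetical online store table
                "Keys": [{"entity_id": "1001"}, {"entity_id": "1002"}],
                "ConsistentRead": True,  # strongly consistent batch read
            }
        }
    )
    items = response["Responses"]["feast.driver_hourly_stats"]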
