Skip to content

Commit 3256952

Browse files
authored
feat: Update stream fcos to have watermark and sliding interval (#2765)
* Add sliding window to aggregations Signed-off-by: Kevin Zhang <[email protected]> * Fix Signed-off-by: Kevin Zhang <[email protected]> * update apis Signed-off-by: Kevin Zhang <[email protected]> * Lint Signed-off-by: Kevin Zhang <[email protected]> * Fix lint Signed-off-by: Kevin Zhang <[email protected]> * Fix Signed-off-by: Kevin Zhang <[email protected]> * Fix Signed-off-by: Kevin Zhang <[email protected]>
1 parent d25e8d4 commit 3256952

File tree

6 files changed

+85
-3
lines changed

6 files changed

+85
-3
lines changed

protos/feast/core/Aggregation.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ message Aggregation {
1111
string column = 1;
1212
string function = 2;
1313
google.protobuf.Duration time_window = 3;
14+
google.protobuf.Duration slide_interval = 4;
1415
}

protos/feast/core/DataSource.proto

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ option go_package = "github.com/feast-dev/feast/go/protos/feast/core";
2222
option java_outer_classname = "DataSourceProto";
2323
option java_package = "feast.proto.core";
2424

25+
import "google/protobuf/duration.proto";
2526
import "feast/core/DataFormat.proto";
2627
import "feast/types/Value.proto";
2728
import "feast/core/Feature.proto";
@@ -135,6 +136,7 @@ message DataSource {
135136
// Defines the stream data format encoding feature/entity data in Kafka messages.
136137
StreamFormat message_format = 3;
137138

139+
google.protobuf.Duration watermark = 4;
138140
}
139141

140142
// Defines options for DataSource that sources features from Kinesis records.

sdk/python/feast/aggregation.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,30 +14,45 @@ class Aggregation:
1414
column: str # Column name of the feature we are aggregating.
1515
function: str # Provided built-in aggregations: sum, max, min, count, mean
1616
time_window: timedelta # The time window for this aggregation.
17+
slide_interval: timedelta # The sliding window for these aggregations
1718
"""
1819

1920
column: str
2021
function: str
2122
time_window: Optional[timedelta]
23+
slide_interval: Optional[timedelta]
2224

2325
def __init__(
2426
self,
2527
column: Optional[str] = "",
2628
function: Optional[str] = "",
2729
time_window: Optional[timedelta] = None,
30+
slide_interval: Optional[timedelta] = None,
2831
):
2932
self.column = column or ""
3033
self.function = function or ""
3134
self.time_window = time_window
35+
if not slide_interval:
36+
self.slide_interval = self.time_window
37+
else:
38+
self.slide_interval = slide_interval
3239

3340
def to_proto(self) -> AggregationProto:
3441
window_duration = None
3542
if self.time_window is not None:
3643
window_duration = Duration()
3744
window_duration.FromTimedelta(self.time_window)
3845

46+
slide_interval_duration = None
47+
if self.slide_interval is not None:
48+
slide_interval_duration = Duration()
49+
slide_interval_duration.FromTimedelta(self.slide_interval)
50+
3951
return AggregationProto(
40-
column=self.column, function=self.function, time_window=window_duration
52+
column=self.column,
53+
function=self.function,
54+
time_window=window_duration,
55+
slide_interval=slide_interval_duration,
4156
)
4257

4358
@classmethod
@@ -48,10 +63,16 @@ def from_proto(cls, agg_proto: AggregationProto):
4863
else agg_proto.time_window.ToTimedelta()
4964
)
5065

66+
slide_interval = (
67+
timedelta(days=0)
68+
if agg_proto.slide_interval.ToNanoseconds() == 0
69+
else agg_proto.slide_interval.ToTimedelta()
70+
)
5171
aggregation = cls(
5272
column=agg_proto.column,
5373
function=agg_proto.function,
5474
time_window=time_window,
75+
slide_interval=slide_interval,
5576
)
5677
return aggregation
5778

@@ -63,6 +84,7 @@ def __eq__(self, other):
6384
self.column != other.column
6485
or self.function != other.function
6586
or self.time_window != other.time_window
87+
or self.slide_interval != other.slide_interval
6688
):
6789
return False
6890

sdk/python/feast/data_source.py

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@
1515
import enum
1616
import warnings
1717
from abc import ABC, abstractmethod
18+
from datetime import timedelta
1819
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
1920

21+
from google.protobuf.duration_pb2 import Duration
2022
from google.protobuf.json_format import MessageToJson
2123

2224
from feast import type_map
@@ -47,11 +49,16 @@ class KafkaOptions:
4749
"""
4850

4951
def __init__(
50-
self, bootstrap_servers: str, message_format: StreamFormat, topic: str,
52+
self,
53+
bootstrap_servers: str,
54+
message_format: StreamFormat,
55+
topic: str,
56+
watermark: Optional[timedelta] = None,
5157
):
5258
self.bootstrap_servers = bootstrap_servers
5359
self.message_format = message_format
5460
self.topic = topic
61+
self.watermark = watermark or None
5562

5663
@classmethod
5764
def from_proto(cls, kafka_options_proto: DataSourceProto.KafkaOptions):
@@ -64,11 +71,18 @@ def from_proto(cls, kafka_options_proto: DataSourceProto.KafkaOptions):
6471
Returns:
6572
Returns a KafkaOptions object based on the kafka_options protobuf
6673
"""
67-
74+
watermark = None
75+
if kafka_options_proto.HasField("watermark"):
76+
watermark = (
77+
timedelta(days=0)
78+
if kafka_options_proto.watermark.ToNanoseconds() == 0
79+
else kafka_options_proto.watermark.ToTimedelta()
80+
)
6881
kafka_options = cls(
6982
bootstrap_servers=kafka_options_proto.bootstrap_servers,
7083
message_format=StreamFormat.from_proto(kafka_options_proto.message_format),
7184
topic=kafka_options_proto.topic,
85+
watermark=watermark,
7286
)
7387

7488
return kafka_options
@@ -80,11 +94,16 @@ def to_proto(self) -> DataSourceProto.KafkaOptions:
8094
Returns:
8195
KafkaOptionsProto protobuf
8296
"""
97+
watermark_duration = None
98+
if self.watermark is not None:
99+
watermark_duration = Duration()
100+
watermark_duration.FromTimedelta(self.watermark)
83101

84102
kafka_options_proto = DataSourceProto.KafkaOptions(
85103
bootstrap_servers=self.bootstrap_servers,
86104
message_format=self.message_format.to_proto(),
87105
topic=self.topic,
106+
watermark=watermark_duration,
88107
)
89108

90109
return kafka_options_proto
@@ -369,7 +388,32 @@ def __init__(
369388
owner: Optional[str] = "",
370389
timestamp_field: Optional[str] = "",
371390
batch_source: Optional[DataSource] = None,
391+
watermark: Optional[timedelta] = None,
372392
):
393+
"""
394+
Creates a KafkaSource stream source object.
395+
Args:
396+
name: str. Name of data source, which should be unique within a project
397+
event_timestamp_column (optional): str. (Deprecated) Event timestamp column used for point in time
398+
joins of feature values.
399+
bootstrap_servers: str. The servers of the kafka broker in the form "localhost:9092".
400+
message_format: StreamFormat. StreamFormat of serialized messages.
401+
topic: str. The name of the topic to read from in the kafka source.
402+
created_timestamp_column (optional): str. Timestamp column indicating when the row
403+
was created, used for deduplicating rows.
404+
field_mapping (optional): dict(str, str). A dictionary mapping of column names in this data
405+
source to feature names in a feature table or view. Only used for feature
406+
columns, not entity or timestamp columns.
407+
date_partition_column (optional): str. Timestamp column used for partitioning.
408+
description (optional): str. A human-readable description.
409+
tags (optional): dict(str, str). A dictionary of key-value pairs to store arbitrary metadata.
410+
owner (optional): str. The owner of the data source, typically the email of the primary
411+
maintainer.
412+
timestamp_field (optional): str. Event timestamp field used for point
413+
in time joins of feature values.
414+
batch_source: DataSource. The datasource that acts as a batch source.
415+
watermark: timedelta. The watermark for stream data, i.e., how late stream data can arrive without being discarded.
416+
"""
373417
positional_attributes = [
374418
"name",
375419
"event_timestamp_column",
@@ -425,10 +469,12 @@ def __init__(
425469
timestamp_field=timestamp_field,
426470
)
427471
self.batch_source = batch_source
472+
428473
self.kafka_options = KafkaOptions(
429474
bootstrap_servers=_bootstrap_servers,
430475
message_format=_message_format,
431476
topic=_topic,
477+
watermark=watermark,
432478
)
433479

434480
def __eq__(self, other):
@@ -445,6 +491,7 @@ def __eq__(self, other):
445491
!= other.kafka_options.bootstrap_servers
446492
or self.kafka_options.message_format != other.kafka_options.message_format
447493
or self.kafka_options.topic != other.kafka_options.topic
494+
or self.kafka_options.watermark != other.kafka_options.watermark
448495
):
449496
return False
450497

@@ -455,6 +502,13 @@ def __hash__(self):
455502

456503
@staticmethod
457504
def from_proto(data_source: DataSourceProto):
505+
watermark = None
506+
if data_source.kafka_options.HasField("watermark"):
507+
watermark = (
508+
timedelta(days=0)
509+
if data_source.kafka_options.watermark.ToNanoseconds() == 0
510+
else data_source.kafka_options.watermark.ToTimedelta()
511+
)
458512
return KafkaSource(
459513
name=data_source.name,
460514
event_timestamp_column=data_source.timestamp_field,
@@ -463,6 +517,7 @@ def from_proto(data_source: DataSourceProto):
463517
message_format=StreamFormat.from_proto(
464518
data_source.kafka_options.message_format
465519
),
520+
watermark=watermark,
466521
topic=data_source.kafka_options.topic,
467522
created_timestamp_column=data_source.created_timestamp_column,
468523
timestamp_field=data_source.timestamp_field,

sdk/python/tests/integration/registration/test_registry.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,7 @@ def simple_udf(x: int):
319319
message_format=AvroFormat(""),
320320
topic="topic",
321321
batch_source=FileSource(path="some path"),
322+
watermark=timedelta(days=1),
322323
)
323324

324325
sfv = StreamFeatureView(

sdk/python/tests/integration/registration/test_stream_feature_view_apply.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def test_apply_stream_feature_view(environment) -> None:
2727
message_format=AvroFormat(""),
2828
topic="topic",
2929
batch_source=FileSource(path="test_path", timestamp_field="event_timestamp"),
30+
watermark=timedelta(days=1),
3031
)
3132

3233
@stream_feature_view(

0 commit comments

Comments
 (0)