18
18
mod bitmap;
19
19
pub use bitmap:: Bitmap ;
20
20
21
+ use crate :: encoding:: Encoding ;
21
22
use crate :: error:: { FlockError , Result } ;
22
- use crate :: runtime:: payload:: Payload ;
23
+ use crate :: runtime:: payload:: { DataFrame , Payload } ;
24
+ use crate :: transmute:: * ;
23
25
use datafusion:: arrow:: datatypes:: SchemaRef ;
24
26
use datafusion:: arrow:: record_batch:: RecordBatch ;
27
+ use datafusion:: arrow_flight:: utils:: flight_data_to_arrow_batch;
28
+ use datafusion:: arrow_flight:: FlightData ;
25
29
use hashbrown:: HashMap ;
30
+ use rayon:: prelude:: * ;
26
31
use std:: ops:: { Deref , DerefMut } ;
32
+ use tokio:: task:: JoinHandle ;
27
33
28
34
/// Query identifiers are represented as plain strings.
type QueryId = String ;
29
35
/// Shuffle identifiers are represented as `usize` values.
type ShuffleId = usize ;
@@ -61,29 +67,38 @@ pub struct Arena(HashMap<WindowId, WindowSession>);
61
67
pub struct WindowSession {
62
68
/// The number of data fragments in the window.
63
69
/// [`WindowSession::size`] equals to [`Uuid::seq_len`].
64
- pub size : usize ,
65
- /// Aggregate record batches for the first relation.
66
- pub r1_records : Vec < Vec < RecordBatch > > ,
67
- /// Aggregate record batches for the second relation.
68
- pub r2_records : Vec < Vec < RecordBatch > > ,
70
+ pub size : usize ,
71
+ /// Aggregate the encoded data frames for the first relation.
72
+ /// https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/
73
+ pub r1_flight_data : Vec < Vec < DataFrame > > ,
74
+ /// The schema of the first relation.
75
+ pub r1_schema : Vec < u8 > ,
76
+ /// Aggregate the encoded data frames for the second relation.
77
+ /// https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/
78
+ pub r2_flight_data : Vec < Vec < DataFrame > > ,
79
+ /// The schema of the second relation.
80
+ pub r2_schema : Vec < u8 > ,
69
81
/// Bitmap indicating the data existence in the window.
70
- pub bitmap : Bitmap ,
82
+ pub bitmap : Bitmap ,
83
+ /// The compression method.
84
+ pub encoding : Encoding ,
71
85
}
72
86
73
87
impl WindowSession {
74
88
/// Return the schema of data fragments in the temporal window.
75
89
pub fn schema ( & self ) -> Result < ( SchemaRef , Option < SchemaRef > ) > {
76
- if self . r1_records . is_empty ( ) || self . r1_records [ 0 ] . is_empty ( ) {
90
+ if self . r1_schema . is_empty ( ) {
77
91
return Err ( FlockError :: Internal (
78
92
"Record batches are empty." . to_string ( ) ,
79
93
) ) ;
80
94
}
81
- if !self . r2_records . is_empty ( ) && !self . r2_records [ 0 ] . is_empty ( ) {
82
- Ok ( ( self . r1_records [ 0 ] [ 0 ] . schema ( ) , None ) )
95
+
96
+ if self . r2_schema . is_empty ( ) {
97
+ Ok ( ( schema_from_bytes ( & self . r1_schema ) ?, None ) )
83
98
} else {
84
99
Ok ( (
85
- self . r1_records [ 0 ] [ 0 ] . schema ( ) ,
86
- Some ( self . r2_records [ 0 ] [ 0 ] . schema ( ) ) ,
100
+ schema_from_bytes ( & self . r1_schema ) ? ,
101
+ Some ( schema_from_bytes ( & self . r2_schema ) ? ) ,
87
102
) )
88
103
}
89
104
}
@@ -95,12 +110,59 @@ impl Arena {
95
110
Arena ( HashMap :: < WindowId , WindowSession > :: new ( ) )
96
111
}
97
112
98
- /// Get the data fragments in the temporal window via the key.
99
- pub fn take_batches ( & mut self , window_id : & WindowId ) -> Vec < Vec < Vec < RecordBatch > > > {
113
+ /// Take a window from the arena.
114
+ pub async fn take ( & mut self , window_id : & WindowId ) -> Result < Vec < Vec < Vec < RecordBatch > > > > {
115
+ let to_batches = |df : Vec < DataFrame > , schema : SchemaRef | -> Vec < RecordBatch > {
116
+ df. into_par_iter ( )
117
+ . map ( |d| {
118
+ flight_data_to_arrow_batch (
119
+ & FlightData {
120
+ data_body : d. body ,
121
+ data_header : d. header ,
122
+ app_metadata : vec ! [ ] ,
123
+ flight_descriptor : None ,
124
+ } ,
125
+ schema. clone ( ) ,
126
+ & [ ] ,
127
+ )
128
+ . unwrap ( )
129
+ } )
130
+ . collect ( )
131
+ } ;
132
+
100
133
if let Some ( window) = ( * self ) . remove ( window_id) {
101
- vec ! [ window. r1_records, window. r2_records]
134
+ let ( schema1, schema2) = window. schema ( ) ?;
135
+
136
+ let mut tasks: Vec < JoinHandle < Vec < Vec < RecordBatch > > > > = vec ! [ ] ;
137
+
138
+ let encoding = window. encoding . clone ( ) ;
139
+ tasks. push ( tokio:: spawn ( async move {
140
+ window
141
+ . r1_flight_data
142
+ . into_par_iter ( )
143
+ . map ( |d| to_batches ( unmarshal ( d, encoding. clone ( ) ) , schema1. clone ( ) ) )
144
+ . collect ( )
145
+ } ) ) ;
146
+
147
+ if schema2. is_some ( ) {
148
+ let encoding = window. encoding . clone ( ) ;
149
+ let schema2 = schema2. unwrap ( ) ;
150
+ tasks. push ( tokio:: spawn ( async move {
151
+ window
152
+ . r2_flight_data
153
+ . into_par_iter ( )
154
+ . map ( |d| to_batches ( unmarshal ( d, encoding. clone ( ) ) , schema2. clone ( ) ) )
155
+ . collect ( )
156
+ } ) ) ;
157
+ }
158
+
159
+ Ok ( futures:: future:: join_all ( tasks)
160
+ . await
161
+ . into_iter ( )
162
+ . map ( |r| r. unwrap ( ) )
163
+ . collect ( ) )
102
164
} else {
103
- vec ! [ vec![ ] , vec![ ] ]
165
+ Ok ( vec ! [ vec![ ] , vec![ ] ] )
104
166
}
105
167
}
106
168
@@ -112,7 +174,7 @@ impl Arena {
112
174
/// Return true if the temporal window is empty.
113
175
pub fn is_complete ( & self , window_id : & WindowId ) -> bool {
114
176
self . get ( window_id)
115
- . map ( |window| window. size == window. r1_records . len ( ) )
177
+ . map ( |window| window. size == window. r1_flight_data . len ( ) )
116
178
. unwrap_or ( false )
117
179
}
118
180
@@ -132,12 +194,11 @@ impl Arena {
132
194
Some ( window) => {
133
195
assert ! ( uuid. seq_len == window. size) ;
134
196
if !window. bitmap . is_set ( uuid. seq_num ) {
135
- let ( r1, r2) = payload. to_record_batch ( ) ;
136
- window. r1_records . push ( r1) ;
137
- window. r2_records . push ( r2) ;
138
- assert ! ( window. r1_records. len( ) == window. r2_records. len( ) ) ;
197
+ window. r1_flight_data . push ( payload. data ) ;
198
+ window. r2_flight_data . push ( payload. data2 ) ;
199
+ assert ! ( window. r1_flight_data. len( ) == window. r2_flight_data. len( ) ) ;
139
200
window. bitmap . set ( uuid. seq_num ) ;
140
- if window. size == window. r1_records . len ( ) {
201
+ if window. size == window. r1_flight_data . len ( ) {
141
202
HashAggregateStatus :: Ready
142
203
} else {
143
204
HashAggregateStatus :: NotReady
@@ -147,12 +208,14 @@ impl Arena {
147
208
}
148
209
}
149
210
None => {
150
- let ( r1, r2) = payload. to_record_batch ( ) ;
151
211
let mut window = WindowSession {
152
- size : uuid. seq_len ,
153
- r1_records : vec ! [ r1] ,
154
- r2_records : vec ! [ r2] ,
155
- bitmap : Bitmap :: new ( uuid. seq_len + 1 ) , // Starts from 1.
212
+ size : uuid. seq_len ,
213
+ r1_flight_data : vec ! [ payload. data] ,
214
+ r2_flight_data : vec ! [ payload. data2] ,
215
+ r1_schema : payload. schema ,
216
+ r2_schema : payload. schema2 ,
217
+ bitmap : Bitmap :: new ( uuid. seq_len + 1 ) , // Starts from 1.
218
+ encoding : payload. encoding ,
156
219
} ;
157
220
// SEQ_NUM is used to indicate the data existence in the window via bitmap.
158
221
window. bitmap . set ( uuid. seq_num ) ;
@@ -243,12 +306,12 @@ mod tests {
243
306
244
307
if let Some ( window) = ( * arena) . get ( & window_id) {
245
308
assert_eq ! ( 8 , window. size) ;
246
- assert_eq ! ( 8 , window. r1_records . len( ) ) ;
309
+ assert_eq ! ( 8 , window. r1_flight_data . len( ) ) ;
247
310
( 0 ..8 ) . for_each ( |i| assert ! ( window. bitmap. is_set( i + 1 ) ) ) ;
248
311
}
249
312
250
- assert_eq ! ( 8 , arena. take_batches ( & window_id) [ 0 ] . len( ) ) ;
251
- assert_eq ! ( 0 , arena. take_batches ( & ( "no exists" . to_owned( ) , 0 ) ) [ 0 ] . len( ) ) ;
313
+ assert_eq ! ( 8 , arena. take ( & window_id) . await ? [ 0 ] . len( ) ) ;
314
+ assert_eq ! ( 0 , arena. take ( & ( "no exists" . to_owned( ) , 0 ) ) . await ? [ 0 ] . len( ) ) ;
252
315
253
316
Ok ( ( ) )
254
317
}
0 commit comments