@@ -40,10 +40,10 @@ import (
40
40
)
41
41
42
42
// Package-level delay and timeout values: 2 seconds, 5 minutes, 30 seconds, 3 days and 15 minutes respectively.
// NOTE(review): presumably consumed by the application state timer and the placeholder timer - confirm against the callers.
var (
43
- reservationDelay = 2 * time .Second
44
- startingTimeout = 5 * time .Minute
45
- waitingTimeout = 30 * time .Second
46
- completedTimeout = 3 * 24 * time .Hour
43
+ reservationDelay = 2 * time .Second
44
+ startingTimeout = 5 * time .Minute
45
+ completingTimeout = 30 * time .Second
46
+ terminatedTimeout = 3 * 24 * time .Hour
47
47
defaultPlaceholderTimeout = 15 * time .Minute
48
48
)
49
49
@@ -146,8 +146,8 @@ func (sa *Application) IsRunning() bool {
146
146
return sa .stateMachine .Is (Running .String ())
147
147
}
148
148
149
- func (sa * Application ) IsWaiting () bool {
150
- return sa .stateMachine .Is (Waiting .String ())
149
// IsCompleting reports whether the application state machine is in the Completing state.
+ func (sa * Application ) IsCompleting () bool {
150
+ return sa .stateMachine .Is (Completing .String ())
151
151
}
152
152
153
153
func (sa * Application ) IsCompleted () bool {
@@ -158,6 +158,10 @@ func (sa *Application) IsExpired() bool {
158
158
return sa .stateMachine .Is (Expired .String ())
159
159
}
160
160
161
// IsFailing reports whether the application state machine is in the Failing state.
+ func (sa * Application ) IsFailing () bool {
162
+ return sa .stateMachine .Is (Failing .String ())
163
+ }
164
+
161
165
// IsFailed reports whether the application state machine is in the Failed state.
func (sa * Application ) IsFailed () bool {
162
166
return sa .stateMachine .Is (Failed .String ())
163
167
}
@@ -208,13 +212,13 @@ func (sa *Application) setStateTimer(timeout time.Duration, currentState string,
208
212
func (sa * Application ) timeoutStateTimer (expectedState string , event applicationEvent ) func () {
209
213
return func () {
210
214
// make sure we are still in the right state
211
- // we could have been killed or something might have happened while waiting for a lock
215
+ // we could have been failed or something might have happened while waiting for a lock
212
216
if expectedState == sa .stateMachine .Current () {
213
217
log .Logger ().Debug ("Application state: auto progress" ,
214
218
zap .String ("applicationID" , sa .ApplicationID ),
215
219
zap .String ("state" , sa .stateMachine .Current ()))
216
- // if the app is waiting , but there are placeholders left, first do the cleanup
217
- if sa .IsWaiting () && ! resources .IsZero (sa .GetPlaceholderResource ()) {
220
+ // if the app is completing , but there are placeholders left, first do the cleanup
221
+ if sa .IsCompleting () && ! resources .IsZero (sa .GetPlaceholderResource ()) {
218
222
sa .notifyRMAllocationReleased (sa .rmID , sa .getPlaceholderAllocations (), si .TerminationType_TIMEOUT , "releasing placeholders on app complete" )
219
223
sa .clearStateTimer ()
220
224
} else {
@@ -239,9 +243,6 @@ func (sa *Application) clearStateTimer() {
239
243
zap .String ("state" , sa .stateMachine .Current ()))
240
244
}
241
245
242
- func (sa * Application ) isWaitingStateTimedOut () bool {
243
- return sa .IsWaiting () && sa .stateTimer == nil
244
- }
245
246
func (sa * Application ) initPlaceholderTimer () {
246
247
if sa .placeholderTimer != nil || ! sa .IsAccepted () || sa .execTimeout <= 0 {
247
248
return
@@ -271,16 +272,17 @@ func (sa *Application) timeoutPlaceholderProcessing() {
271
272
}
272
273
// Case 2: in every other case fail the application, and notify the context about the expired placeholder asks
273
274
default :
274
- sa .notifyRMAllocationAskReleased (sa .rmID , sa .getAllRequests (), si .TerminationType_TIMEOUT , "releasing placeholders on placeholder timeout" )
275
- sa .removeAsksInternal ("" )
275
+ // change the status of the app to Failing. Once all the placeholders are cleaned up, it will be changed to Failed
276
276
if err := sa .HandleApplicationEvent (FailApplication ); err != nil {
277
277
log .Logger ().Debug ("Application state change failed when placeholder timed out" ,
278
278
zap .String ("AppID" , sa .ApplicationID ),
279
279
zap .String ("currentState" , sa .CurrentState ()),
280
280
zap .Error (err ))
281
281
}
282
+ sa .notifyRMAllocationAskReleased (sa .rmID , sa .getAllRequests (), si .TerminationType_TIMEOUT , "releasing placeholders asks on placeholder timeout" )
283
+ sa .removeAsksInternal ("" )
282
284
}
283
- sa .notifyRMAllocationReleased (sa .rmID , sa .getPlaceholderAllocations (), si .TerminationType_TIMEOUT , "releasing placeholders on placeholder timeout" )
285
+ sa .notifyRMAllocationReleased (sa .rmID , sa .getPlaceholderAllocations (), si .TerminationType_TIMEOUT , "releasing allocated placeholders on placeholder timeout" )
284
286
sa .clearPlaceholderTimer ()
285
287
}
286
288
@@ -397,12 +399,11 @@ func (sa *Application) removeAsksInternal(allocKey string) int {
397
399
// Check if we need to change state based on the ask removal:
398
400
// 1) if pending is zero (no more asks left)
399
401
// 2) if confirmed allocations is zero (no real tasks running)
400
- // 3) if placeholder allocations is zero (no placeholders running)
401
- // Change the state to waiting.
402
+ // Change the state to completing.
402
403
// When the resource trackers are zero we should not expect anything to come in later.
403
- if resources .IsZero (sa .pending ) && resources .IsZero (sa .allocatedResource ) {
404
- if err := sa .HandleApplicationEvent (WaitApplication ); err != nil {
405
- log .Logger ().Warn ("Application state not changed to Waiting while updating ask(s)" ,
404
+ if resources .IsZero (sa .pending ) && resources .IsZero (sa .allocatedResource ) && ! sa . IsFailing () {
405
+ if err := sa .HandleApplicationEvent (CompleteApplication ); err != nil {
406
+ log .Logger ().Warn ("Application state not changed to Completing while updating ask(s)" ,
406
407
zap .String ("currentState" , sa .CurrentState ()),
407
408
zap .Error (err ))
408
409
}
@@ -437,9 +438,9 @@ func (sa *Application) AddAllocationAsk(ask *AllocationAsk) error {
437
438
438
439
// Check if we need to change state based on the ask added, there are two cases:
439
440
// 1) first ask added on a new app: state is New
440
- // 2) all asks and allocation have been removed: state is Waiting
441
+ // 2) all asks and allocation have been removed: state is Completing
441
442
// Move the state and get it scheduling (again)
442
- if sa .stateMachine .Is (New .String ()) || sa .stateMachine .Is (Waiting .String ()) {
443
+ if sa .stateMachine .Is (New .String ()) || sa .stateMachine .Is (Completing .String ()) {
443
444
if err := sa .HandleApplicationEvent (RunApplication ); err != nil {
444
445
log .Logger ().Debug ("Application state change failed while adding new ask" ,
445
446
zap .String ("currentState" , sa .CurrentState ()),
@@ -1185,21 +1186,25 @@ func (sa *Application) removeAllocationInternal(uuid string) *Allocation {
1185
1186
// if all the placeholders are replaced, clear the placeholder timer
1186
1187
if resources .IsZero (sa .allocatedPlaceholder ) {
1187
1188
sa .clearPlaceholderTimer ()
1189
+ if (sa .IsCompleting () && sa .stateTimer == nil ) || sa .IsFailing () {
1190
+ event := CompleteApplication
1191
+ if sa .IsFailing () {
1192
+ event = FailApplication
1193
+ }
1194
+ if err := sa .HandleApplicationEvent (event ); err != nil {
1195
+ log .Logger ().Warn ("Application state not changed while removing a placeholder allocation" ,
1196
+ zap .String ("currentState" , sa .CurrentState ()),
1197
+ zap .String ("event" , event .String ()),
1198
+ zap .Error (err ))
1199
+ }
1200
+ }
1188
1201
}
1189
1202
} else {
1190
1203
sa .allocatedResource = resources .Sub (sa .allocatedResource , alloc .AllocatedResource )
1191
- }
1192
- // When the resource trackers are zero we should not expect anything to come in later.
1193
- if resources .IsZero (sa .pending ) && resources .IsZero (sa .allocatedResource ) {
1194
- if sa .isWaitingStateTimedOut () && resources .IsZero (sa .allocatedPlaceholder ) {
1204
+ // When the resource trackers are zero we should not expect anything to come in later.
1205
+ if resources .IsZero (sa .pending ) && resources .IsZero (sa .allocatedResource ) {
1195
1206
if err := sa .HandleApplicationEvent (CompleteApplication ); err != nil {
1196
- log .Logger ().Warn ("Application state not changed to Completed while removing some allocation(s)" ,
1197
- zap .String ("currentState" , sa .CurrentState ()),
1198
- zap .Error (err ))
1199
- }
1200
- } else {
1201
- if err := sa .HandleApplicationEvent (WaitApplication ); err != nil {
1202
- log .Logger ().Warn ("Application state not changed to Waiting while removing some allocation(s)" ,
1207
+ log .Logger ().Warn ("Application state not changed to Waiting while removing an allocation" ,
1203
1208
zap .String ("currentState" , sa .CurrentState ()),
1204
1209
zap .Error (err ))
1205
1210
}
@@ -1225,7 +1230,7 @@ func (sa *Application) RemoveAllAllocations() []*Allocation {
1225
1230
sa .allocations = make (map [string ]* Allocation )
1226
1231
// When the resource trackers are zero we should not expect anything to come in later.
1227
1232
if resources .IsZero (sa .pending ) {
1228
- if err := sa .HandleApplicationEvent (WaitApplication ); err != nil {
1233
+ if err := sa .HandleApplicationEvent (CompleteApplication ); err != nil {
1229
1234
log .Logger ().Warn ("Application state not changed to Waiting while removing all allocations" ,
1230
1235
zap .String ("currentState" , sa .CurrentState ()),
1231
1236
zap .Error (err ))
@@ -1303,3 +1308,14 @@ func (sa *Application) notifyRMAllocationAskReleased(rmID string, released []*Al
1303
1308
}
1304
1309
sa .rmEventHandler .HandleEvent (releaseEvent )
1305
1310
}
1311
+
1312
+ // failAppIfPossible auto progresses the application when it enters the Failing state if there is nothing to clean up,
1313
+ // i.e. the pending, allocated and placeholder resource trackers are all zero. Since this is called by the _locked_ state
1314
+ // machine while processing an event we cannot call back into the state machine directly: the event is fired from a goroutine to avoid a deadlock.
1315
+ func (sa * Application ) failAppIfPossible () {
1316
+ if resources .IsZero (sa .pending ) && resources .IsZero (sa .allocatedResource ) && resources .IsZero (sa .allocatedPlaceholder ) {
1317
+ // The returned error is not checked: the event handling cannot fail
1318
+ //nolint: errcheck
1319
+ go sa .HandleApplicationEvent (FailApplication )
1320
+ }
1321
+ }
0 commit comments