Skip to content

Commit 122d2bb

Browse files
remo-lab authored and committed
feat: Enhanced Health Checks with Dependency Status
- Add /readyz endpoint for Kubernetes readiness probes
- Add /healthz/detailed endpoint for component-level status
- Implement GossipChecker, LedgerChecker, and OrdererChecker
- Support OK/DEGRADED/UNAVAILABLE status semantics
- DEGRADED components don't fail readiness (only UNAVAILABLE does)
- Safe defaults: minPeers=0, failOnLag=false, orderer disabled
- Add comprehensive unit and integration tests
- Update configuration and documentation
1 parent be1959b commit 122d2bb

16 files changed

+1570
-310
lines changed
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
/*
2+
Copyright IBM Corp. All Rights Reserved.
3+
4+
SPDX-License-Identifier: Apache-2.0
5+
*/
6+
7+
package operations
8+
9+
import (
10+
"context"
11+
"encoding/json"
12+
"net/http"
13+
"time"
14+
15+
libhealthz "github.com/hyperledger/fabric-lib-go/healthz"
16+
"github.com/hyperledger/fabric/core/operations/healthz"
17+
)
18+
19+
// DetailedHealthHandler handles /healthz/detailed. Access should be restricted
20+
// via TLS/client authentication.
21+
type DetailedHealthHandler struct {
22+
readinessHandler *healthz.ReadinessHandler
23+
healthHandler HealthHandler
24+
timeout time.Duration
25+
}
26+
27+
type HealthHandler interface {
28+
RunChecks(context.Context) []libhealthz.FailedCheck
29+
}
30+
31+
func NewDetailedHealthHandler(readinessHandler *healthz.ReadinessHandler, healthHandler HealthHandler, timeout time.Duration) *DetailedHealthHandler {
32+
return &DetailedHealthHandler{
33+
readinessHandler: readinessHandler,
34+
healthHandler: healthHandler,
35+
timeout: timeout,
36+
}
37+
}
38+
39+
func (h *DetailedHealthHandler) ServeHTTP(rw http.ResponseWriter, req *http.Request) {
40+
if req.Method != "GET" {
41+
rw.WriteHeader(http.StatusMethodNotAllowed)
42+
return
43+
}
44+
45+
ctx, cancel := context.WithTimeout(req.Context(), h.timeout)
46+
defer cancel()
47+
48+
detailedStatus := h.readinessHandler.GetDetailedStatus(ctx)
49+
50+
livenessChecks := h.healthHandler.RunChecks(ctx)
51+
if len(livenessChecks) > 0 {
52+
for _, check := range livenessChecks {
53+
detailedStatus.FailedChecks = append(detailedStatus.FailedChecks, healthz.FailedCheck{
54+
Component: check.Component,
55+
Reason: check.Reason,
56+
})
57+
if _, exists := detailedStatus.Components[check.Component]; !exists {
58+
detailedStatus.Components[check.Component] = healthz.ComponentStatus{
59+
Status: healthz.StatusUnavailable,
60+
Message: check.Reason,
61+
}
62+
}
63+
}
64+
if detailedStatus.Status == healthz.StatusOK {
65+
detailedStatus.Status = healthz.StatusUnavailable
66+
}
67+
}
68+
69+
rw.Header().Set("Content-Type", "application/json")
70+
71+
var statusCode int
72+
switch detailedStatus.Status {
73+
case healthz.StatusOK:
74+
statusCode = http.StatusOK
75+
case healthz.StatusDegraded:
76+
statusCode = http.StatusOK
77+
default:
78+
statusCode = http.StatusServiceUnavailable
79+
}
80+
81+
resp, err := json.Marshal(detailedStatus)
82+
if err != nil {
83+
rw.WriteHeader(http.StatusInternalServerError)
84+
return
85+
}
86+
87+
rw.WriteHeader(statusCode)
88+
rw.Write(resp)
89+
}
90+
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
/*
2+
Copyright IBM Corp. All Rights Reserved.
3+
4+
SPDX-License-Identifier: Apache-2.0
5+
*/
6+
7+
package healthcheckers
8+
9+
import (
10+
"context"
11+
"fmt"
12+
"time"
13+
14+
"github.com/hyperledger/fabric/core/operations/healthz"
15+
"github.com/hyperledger/fabric/gossip/service"
16+
)
17+
18+
// GossipChecker verifies gossip service connectivity. When minPeers = 0 (default),
19+
// only checks that gossip is initialized, avoiding false negatives in dev setups.
20+
// When minPeers > 0, enforces minimum peer count. DEGRADED at minimum is informational.
21+
type GossipChecker struct {
22+
gossipService *service.GossipService
23+
minPeers int
24+
timeout time.Duration
25+
}
26+
27+
// NewGossipChecker creates a new GossipChecker. If minPeers is 0, only verifies
28+
// gossip is initialized.
29+
func NewGossipChecker(gossipService *service.GossipService, minPeers int, timeout time.Duration) *GossipChecker {
30+
return &GossipChecker{
31+
gossipService: gossipService,
32+
minPeers: minPeers,
33+
timeout: timeout,
34+
}
35+
}
36+
37+
func (g *GossipChecker) ReadinessCheck(ctx context.Context) error {
38+
if g.gossipService == nil {
39+
return fmt.Errorf("gossip service not initialized")
40+
}
41+
42+
peers := g.gossipService.Peers()
43+
connectedCount := len(peers)
44+
45+
if g.minPeers > 0 && connectedCount < g.minPeers {
46+
return fmt.Errorf("insufficient peers connected: %d (minimum: %d)", connectedCount, g.minPeers)
47+
}
48+
49+
return nil
50+
}
51+
52+
func (g *GossipChecker) GetStatus() healthz.ComponentStatus {
53+
if g.gossipService == nil {
54+
return healthz.ComponentStatus{
55+
Status: healthz.StatusUnavailable,
56+
Message: "Gossip service not initialized",
57+
}
58+
}
59+
60+
peers := g.gossipService.Peers()
61+
connectedCount := len(peers)
62+
63+
status := healthz.StatusOK
64+
message := fmt.Sprintf("Connected to %d peers", connectedCount)
65+
66+
if g.minPeers > 0 && connectedCount < g.minPeers {
67+
status = healthz.StatusUnavailable
68+
message = fmt.Sprintf("Insufficient peers: %d (minimum: %d)", connectedCount, g.minPeers)
69+
} else if g.minPeers > 0 && connectedCount == g.minPeers {
70+
status = healthz.StatusDegraded
71+
message = fmt.Sprintf("Connected to minimum required peers: %d", connectedCount)
72+
}
73+
74+
details := map[string]interface{}{
75+
"connected_peers": connectedCount,
76+
"min_peers": g.minPeers,
77+
}
78+
79+
return healthz.ComponentStatus{
80+
Status: status,
81+
Message: message,
82+
Details: details,
83+
}
84+
}
85+
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/*
2+
Copyright IBM Corp. All Rights Reserved.
3+
4+
SPDX-License-Identifier: Apache-2.0
5+
*/
6+
7+
package healthcheckers
8+
9+
import (
10+
"context"
11+
"testing"
12+
13+
"github.com/hyperledger/fabric/core/operations/healthz"
14+
"github.com/hyperledger/fabric/gossip/service"
15+
"github.com/stretchr/testify/assert"
16+
"github.com/stretchr/testify/require"
17+
)
18+
19+
func TestGossipChecker_ReadinessCheck(t *testing.T) {
20+
tests := []struct {
21+
name string
22+
gossip *service.GossipService
23+
minPeers int
24+
expectError bool
25+
errorMsg string
26+
}{
27+
{
28+
name: "nil gossip service",
29+
gossip: nil,
30+
minPeers: 0,
31+
expectError: true,
32+
errorMsg: "gossip service not initialized",
33+
},
34+
}
35+
36+
for _, tt := range tests {
37+
t.Run(tt.name, func(t *testing.T) {
38+
checker := &GossipChecker{
39+
gossipService: tt.gossip,
40+
minPeers: tt.minPeers,
41+
timeout: 5,
42+
}
43+
44+
err := checker.ReadinessCheck(context.Background())
45+
46+
if tt.expectError {
47+
require.Error(t, err)
48+
if tt.errorMsg != "" {
49+
assert.Contains(t, err.Error(), tt.errorMsg)
50+
}
51+
} else {
52+
require.NoError(t, err)
53+
}
54+
})
55+
}
56+
}
57+
58+
func TestGossipChecker_GetStatus(t *testing.T) {
59+
tests := []struct {
60+
name string
61+
gossip *service.GossipService
62+
minPeers int
63+
expectedStatus string
64+
}{
65+
{
66+
name: "nil gossip service",
67+
gossip: nil,
68+
minPeers: 0,
69+
expectedStatus: healthz.StatusUnavailable,
70+
},
71+
}
72+
73+
for _, tt := range tests {
74+
t.Run(tt.name, func(t *testing.T) {
75+
checker := &GossipChecker{
76+
gossipService: tt.gossip,
77+
minPeers: tt.minPeers,
78+
timeout: 5,
79+
}
80+
81+
status := checker.GetStatus()
82+
assert.Equal(t, tt.expectedStatus, status.Status)
83+
assert.NotEmpty(t, status.Message)
84+
})
85+
}
86+
}
87+

0 commit comments

Comments
 (0)