Skip to content

Commit f1ab5e2

Browse files
remo-lab
authored and committed
feat: Enhanced Health Checks with Dependency Status
- Add /readyz endpoint for Kubernetes readiness probes
- Add /healthz/detailed endpoint for component-level status
- Implement GossipChecker, LedgerChecker, and OrdererChecker
- Support OK/DEGRADED/UNAVAILABLE status semantics
- DEGRADED components don't fail readiness (only UNAVAILABLE does)
- Safe defaults: minPeers=0, failOnLag=false, orderer disabled
- Add comprehensive unit and integration tests
- Update configuration and documentation

Signed-off-by: remo-lab <[email protected]>
1 parent a0f9d87 commit f1ab5e2

File tree

6 files changed

+433
-310
lines changed

6 files changed

+433
-310
lines changed
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
/*
2+
Copyright IBM Corp All Rights Reserved.
3+
4+
SPDX-License-Identifier: Apache-2.0
5+
*/
6+
7+
package operations
8+
9+
import (
10+
"context"
11+
"encoding/json"
12+
"net/http"
13+
"time"
14+
15+
libhealthz "github.com/hyperledger/fabric-lib-go/healthz"
16+
"github.com/hyperledger/fabric/core/operations/healthz"
17+
)
18+
19+
// DetailedHealthHandler handles /healthz/detailed. Access should be restricted
20+
// via TLS/client authentication.
21+
type DetailedHealthHandler struct {
22+
readinessHandler *healthz.ReadinessHandler
23+
healthHandler HealthHandler
24+
timeout time.Duration
25+
}
26+
27+
type HealthHandler interface {
28+
RunChecks(context.Context) []libhealthz.FailedCheck
29+
}
30+
31+
func NewDetailedHealthHandler(readinessHandler *healthz.ReadinessHandler, healthHandler HealthHandler, timeout time.Duration) *DetailedHealthHandler {
32+
return &DetailedHealthHandler{
33+
readinessHandler: readinessHandler,
34+
healthHandler: healthHandler,
35+
timeout: timeout,
36+
}
37+
}
38+
39+
func (h *DetailedHealthHandler) ServeHTTP(rw http.ResponseWriter, req *http.Request) {
40+
if req.Method != "GET" {
41+
rw.WriteHeader(http.StatusMethodNotAllowed)
42+
return
43+
}
44+
45+
ctx, cancel := context.WithTimeout(req.Context(), h.timeout)
46+
defer cancel()
47+
48+
detailedStatus := h.readinessHandler.GetDetailedStatus(ctx)
49+
50+
livenessChecks := h.healthHandler.RunChecks(ctx)
51+
if len(livenessChecks) > 0 {
52+
for _, check := range livenessChecks {
53+
detailedStatus.FailedChecks = append(detailedStatus.FailedChecks, healthz.FailedCheck{
54+
Component: check.Component,
55+
Reason: check.Reason,
56+
})
57+
if _, exists := detailedStatus.Components[check.Component]; !exists {
58+
detailedStatus.Components[check.Component] = healthz.ComponentStatus{
59+
Status: healthz.StatusUnavailable,
60+
Message: check.Reason,
61+
}
62+
}
63+
}
64+
if detailedStatus.Status == healthz.StatusOK {
65+
detailedStatus.Status = healthz.StatusUnavailable
66+
}
67+
}
68+
69+
rw.Header().Set("Content-Type", "application/json")
70+
71+
var statusCode int
72+
switch detailedStatus.Status {
73+
case healthz.StatusOK:
74+
statusCode = http.StatusOK
75+
case healthz.StatusDegraded:
76+
statusCode = http.StatusOK
77+
default:
78+
statusCode = http.StatusServiceUnavailable
79+
}
80+
81+
resp, err := json.Marshal(detailedStatus)
82+
if err != nil {
83+
rw.WriteHeader(http.StatusInternalServerError)
84+
return
85+
}
86+
87+
rw.WriteHeader(statusCode)
88+
rw.Write(resp)
89+
}
90+

core/operations/system.go

Lines changed: 58 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ package operations
88

99
import (
1010
"context"
11+
"fmt"
1112
"net"
1213
"strings"
1314
"time"
@@ -20,9 +21,10 @@ import (
2021
"github.com/hyperledger/fabric-lib-go/common/metrics/prometheus"
2122
"github.com/hyperledger/fabric-lib-go/common/metrics/statsd"
2223
"github.com/hyperledger/fabric-lib-go/common/metrics/statsd/goruntime"
23-
"github.com/hyperledger/fabric-lib-go/healthz"
24+
libhealthz "github.com/hyperledger/fabric-lib-go/healthz"
2425
"github.com/hyperledger/fabric/common/fabhttp"
2526
"github.com/hyperledger/fabric/common/metadata"
27+
"github.com/hyperledger/fabric/core/operations/healthz"
2628
"github.com/prometheus/client_golang/prometheus/promhttp"
2729
)
2830

@@ -55,13 +57,14 @@ type System struct {
5557
*fabhttp.Server
5658
metrics.Provider
5759

58-
logger Logger
59-
healthHandler *healthz.HealthHandler
60-
options Options
61-
statsd *kitstatsd.Statsd
62-
collectorTicker *time.Ticker
63-
sendTicker *time.Ticker
64-
versionGauge metrics.Gauge
60+
logger Logger
61+
healthHandler *libhealthz.HealthHandler
62+
readinessHandler *healthz.ReadinessHandler
63+
options Options
64+
statsd *kitstatsd.Statsd
65+
collectorTicker *time.Ticker
66+
sendTicker *time.Ticker
67+
versionGauge metrics.Gauge
6568
}
6669

6770
func NewSystem(o Options) *System {
@@ -109,10 +112,21 @@ func (s *System) Stop() error {
109112
return s.Server.Stop()
110113
}
111114

112-
func (s *System) RegisterChecker(component string, checker healthz.HealthChecker) error {
115+
func (s *System) RegisterChecker(component string, checker libhealthz.HealthChecker) error {
113116
return s.healthHandler.RegisterChecker(component, checker)
114117
}
115118

119+
// RegisterReadinessChecker registers a ReadinessChecker for a named component.
120+
// Readiness checkers verify that dependencies are available and the service
121+
// is ready to accept traffic. These checks are separate from liveness checks
122+
// and are used by Kubernetes readiness probes.
123+
func (s *System) RegisterReadinessChecker(component string, checker healthz.ReadinessChecker) error {
124+
if s.readinessHandler == nil {
125+
return fmt.Errorf("readiness handler not initialized")
126+
}
127+
return s.readinessHandler.RegisterChecker(component, checker)
128+
}
129+
116130
func (s *System) initializeMetricsProvider() error {
117131
m := s.options.Metrics
118132
providerType := m.Provider
@@ -180,7 +194,7 @@ func (s *System) initializeLoggingHandler() {
180194
}
181195

182196
func (s *System) initializeHealthCheckHandler() {
183-
s.healthHandler = healthz.NewHealthHandler()
197+
s.healthHandler = libhealthz.NewHealthHandler()
184198
// swagger:operation GET /healthz operations healthz
185199
// ---
186200
// summary: Retrieves all registered health checkers for the process.
@@ -190,6 +204,40 @@ func (s *System) initializeHealthCheckHandler() {
190204
// '503':
191205
// description: Service unavailable.
192206
s.RegisterHandler("/healthz", s.healthHandler, false)
207+
208+
// Initialize readiness handler for Kubernetes readiness probes
209+
s.readinessHandler = healthz.NewReadinessHandler()
210+
// swagger:operation GET /readyz operations readyz
211+
// ---
212+
// summary: Retrieves readiness status for the process. Used by Kubernetes readiness probes.
213+
// responses:
214+
// '200':
215+
// description: Ready.
216+
// '503':
217+
// description: Not ready.
218+
s.RegisterHandler("/readyz", s.readinessHandler, false)
219+
}
220+
221+
// SetDetailedHealthEnabled enables or disables the detailed health endpoint.
222+
func (s *System) SetDetailedHealthEnabled(enabled bool) {
223+
if enabled && s.readinessHandler != nil {
224+
s.logger.Warn("Detailed health endpoint enabled; exposes internal component status. Ensure access is restricted via TLS and client authentication.")
225+
226+
detailedHandler := NewDetailedHealthHandler(
227+
s.readinessHandler,
228+
&healthHandlerAdapter{handler: s.healthHandler},
229+
30*time.Second,
230+
)
231+
s.RegisterHandler("/healthz/detailed", detailedHandler, s.options.TLS.Enabled)
232+
}
233+
}
234+
235+
type healthHandlerAdapter struct {
236+
handler *libhealthz.HealthHandler
237+
}
238+
239+
func (a *healthHandlerAdapter) RunChecks(ctx context.Context) []libhealthz.FailedCheck {
240+
return a.handler.RunChecks(ctx)
193241
}
194242

195243
func (s *System) initializeVersionInfoHandler() {

core/peer/config.go

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,35 @@ type ExternalBuilder struct {
4848
Path string `yaml:"path"`
4949
}
5050

51+
// HealthCheckConfig represents the configuration for readiness health checks
52+
type HealthCheckConfig struct {
53+
ReadinessTimeout time.Duration
54+
Gossip GossipHealthCheckConfig
55+
Ledger LedgerHealthCheckConfig
56+
Orderer OrdererHealthCheckConfig
57+
}
58+
59+
// GossipHealthCheckConfig holds the settings for the gossip readiness check,
// read from the operations.healthCheck.gossip.* configuration keys.
type GossipHealthCheckConfig struct {
	// Enabled is read from operations.healthCheck.gossip.enabled.
	Enabled bool
	// MinPeers is read from operations.healthCheck.gossip.minPeers and is 0
	// when the key is not set. Zero means no connectivity requirement; a peer
	// count is only meant to be enforced for values greater than zero.
	MinPeers int
	// Timeout is read from operations.healthCheck.gossip.timeout and falls
	// back to 5 seconds when the configured value is zero.
	Timeout time.Duration
}
65+
66+
// LedgerHealthCheckConfig holds the settings for the ledger readiness check,
// read from the operations.healthCheck.ledger.* configuration keys.
type LedgerHealthCheckConfig struct {
	// Enabled is read from operations.healthCheck.ledger.enabled.
	Enabled bool
	// Timeout is read from operations.healthCheck.ledger.timeout and falls
	// back to 5 seconds when the configured value is zero.
	Timeout time.Duration
	// FailOnLag is read from operations.healthCheck.ledger.failOnLag.
	// NOTE(review): presumably decides whether exceeding MaxLag fails the
	// check — confirm against the ledger checker.
	FailOnLag bool
	// MaxLag is read from operations.healthCheck.ledger.maxLag and falls back
	// to 10 when the configured value is zero.
	MaxLag uint64
}
73+
74+
// OrdererHealthCheckConfig holds the settings for the orderer connectivity
// check, read from the operations.healthCheck.orderer.* configuration keys.
type OrdererHealthCheckConfig struct {
	// Enabled is read from operations.healthCheck.orderer.enabled.
	Enabled bool
	// Timeout is read from operations.healthCheck.orderer.timeout and falls
	// back to 5 seconds when the configured value is zero.
	Timeout time.Duration
}
79+
5180
// Config is the struct that defines the Peer configurations.
5281
type Config struct {
5382
// LocalMSPID is the identifier of the local MSP.
@@ -187,6 +216,12 @@ type Config struct {
187216
// trust for client authentication.
188217
OperationsTLSClientRootCAs []string
189218

219+
// OperationsHealthCheck provides configuration for readiness health checks
220+
OperationsHealthCheck HealthCheckConfig
221+
222+
// OperationsDetailedHealthEnabled enables the detailed health endpoint
223+
OperationsDetailedHealthEnabled bool
224+
190225
// ----- Metrics config -----
191226
// TODO: create separate sub-struct for Metrics config.
192227

@@ -321,6 +356,44 @@ func (c *Config) load() error {
321356
c.OperationsTLSClientRootCAs = append(c.OperationsTLSClientRootCAs, config.TranslatePath(configDir, rca))
322357
}
323358

359+
// Load health check configuration
360+
c.OperationsHealthCheck.ReadinessTimeout = viper.GetDuration("operations.healthCheck.readinessTimeout")
361+
if c.OperationsHealthCheck.ReadinessTimeout == 0 {
362+
c.OperationsHealthCheck.ReadinessTimeout = 10 * time.Second
363+
}
364+
365+
c.OperationsHealthCheck.Gossip.Enabled = viper.GetBool("operations.healthCheck.gossip.enabled")
366+
c.OperationsHealthCheck.Gossip.MinPeers = viper.GetInt("operations.healthCheck.gossip.minPeers")
367+
// Default minPeers to 0 to avoid false negatives in dev and single-node setups.
368+
// minPeers = 0 means "gossip initialized but no connectivity requirement".
369+
// Only enforce peer count when minPeers > 0.
370+
if !viper.IsSet("operations.healthCheck.gossip.minPeers") {
371+
c.OperationsHealthCheck.Gossip.MinPeers = 0
372+
}
373+
c.OperationsHealthCheck.Gossip.Timeout = viper.GetDuration("operations.healthCheck.gossip.timeout")
374+
if c.OperationsHealthCheck.Gossip.Timeout == 0 {
375+
c.OperationsHealthCheck.Gossip.Timeout = 5 * time.Second
376+
}
377+
378+
c.OperationsHealthCheck.Ledger.Enabled = viper.GetBool("operations.healthCheck.ledger.enabled")
379+
c.OperationsHealthCheck.Ledger.Timeout = viper.GetDuration("operations.healthCheck.ledger.timeout")
380+
if c.OperationsHealthCheck.Ledger.Timeout == 0 {
381+
c.OperationsHealthCheck.Ledger.Timeout = 5 * time.Second
382+
}
383+
c.OperationsHealthCheck.Ledger.FailOnLag = viper.GetBool("operations.healthCheck.ledger.failOnLag")
384+
c.OperationsHealthCheck.Ledger.MaxLag = viper.GetUint64("operations.healthCheck.ledger.maxLag")
385+
if c.OperationsHealthCheck.Ledger.MaxLag == 0 {
386+
c.OperationsHealthCheck.Ledger.MaxLag = 10 // Default max lag
387+
}
388+
389+
c.OperationsHealthCheck.Orderer.Enabled = viper.GetBool("operations.healthCheck.orderer.enabled")
390+
c.OperationsHealthCheck.Orderer.Timeout = viper.GetDuration("operations.healthCheck.orderer.timeout")
391+
if c.OperationsHealthCheck.Orderer.Timeout == 0 {
392+
c.OperationsHealthCheck.Orderer.Timeout = 5 * time.Second
393+
}
394+
395+
c.OperationsDetailedHealthEnabled = viper.GetBool("operations.healthCheck.detailedHealth.enabled")
396+
324397
c.MetricsProvider = viper.GetString("metrics.provider")
325398
c.StatsdNetwork = viper.GetString("metrics.statsd.network")
326399
c.StatsdAaddress = viper.GetString("metrics.statsd.address")

0 commit comments

Comments
 (0)