Skip to content

Commit 25747e5

Browse files
committed
rpc: remove GRPCDial() and disallow anonymous non-gossip connections
The previous patch introduced node ID verification for GRPC connections but preserved the `GRPCDial()` API, alongside the ability to use node ID 0 with `GRPCDialNode()`, to signal that node ID verification should be disabled.

Further examination revealed that this flexibility is 1) hard to reason about and 2) unneeded. So instead of keeping this option and then investing time into producing tests for all the combinations of verification protocols, this patch "cuts the Gordian knot" by removing this flexibility altogether.

In summary:

- `GRPCDial()` is removed.
- `GRPCDialNode()` will call log.Fatal() if provided a 0 node ID.
- `GRPCGossipDial()` is introduced, with a clarification about its contract. I have audited the code to validate that this is indeed only used by gossip and by the CLI client commands that really don't care about the node ID.

Release note: None
1 parent 68ba844 commit 25747e5

File tree

17 files changed

+150
-80
lines changed

17 files changed

+150
-80
lines changed

pkg/cli/debug_test.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,10 @@ func TestRemoveDeadReplicas(t *testing.T) {
252252
tc := testcluster.StartTestCluster(t, 3, clusterArgs)
253253
defer tc.Stopper().Stop(ctx)
254254

255-
grpcConn, err := tc.Server(0).RPCContext().GRPCDial(tc.Server(0).ServingAddr()).Connect(ctx)
255+
grpcConn, err := tc.Server(0).RPCContext().GRPCDialNode(
256+
tc.Server(0).ServingAddr(),
257+
tc.Server(0).NodeID(),
258+
).Connect(ctx)
256259
if err != nil {
257260
t.Fatal(err)
258261
}

pkg/cli/start.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1132,7 +1132,9 @@ func getClientGRPCConn(ctx context.Context) (*grpc.ClientConn, *hlc.Clock, func(
11321132
stopper.Stop(ctx)
11331133
return nil, nil, nil, err
11341134
}
1135-
conn, err := rpcContext.GRPCDial(addr).Connect(ctx)
1135+
// We use GRPCGossipDial() here because it does not matter
1136+
// which node we're talking to.
1137+
conn, err := rpcContext.GRPCGossipDial(addr).Connect(ctx)
11361138
if err != nil {
11371139
stopper.Stop(ctx)
11381140
return nil, nil, nil, err

pkg/gossip/client.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ func (c *client) startLocked(
109109
// asynchronous from the caller's perspective, so the only effect of
110110
// `WithBlock` here is blocking shutdown - at the time of this writing,
111111
// that ends up making `kv` tests take twice as long.
112-
conn, err := rpcCtx.GRPCDial(c.addr.String()).Connect(ctx)
112+
conn, err := rpcCtx.GRPCGossipDial(c.addr.String()).Connect(ctx)
113113
if err != nil {
114114
return err
115115
}

pkg/gossip/gossip_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,7 @@ func TestGossipNoForwardSelf(t *testing.T) {
461461
c := newClient(log.AmbientContext{Tracer: tracing.NewTracer()}, local.GetNodeAddr(), makeMetrics())
462462

463463
testutils.SucceedsSoon(t, func() error {
464-
conn, err := peer.rpcContext.GRPCDial(c.addr.String()).Connect(ctx)
464+
conn, err := peer.rpcContext.GRPCGossipDial(c.addr.String()).Connect(ctx)
465465
if err != nil {
466466
return err
467467
}

pkg/kv/send_test.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,12 @@ func TestSendToOneClient(t *testing.T) {
6666
stopper,
6767
&cluster.MakeTestingClusterSettings().Version,
6868
)
69+
70+
// This test uses the testing function sendBatch() which does not
71+
// support setting the node ID on GRPCDialNode(). Disable Node ID
72+
// checks to avoid log.Fatal.
73+
rpcContext.TestingAllowNamedRPCToAnonymousServer = true
74+
6975
s := rpc.NewServer(rpcContext)
7076
roachpb.RegisterInternalServer(s, Node(0))
7177
ln, err := netutil.ListenAndServeGRPC(rpcContext.Stopper, s, util.TestAddr)
@@ -136,6 +142,10 @@ func TestComplexScenarios(t *testing.T) {
136142
stopper,
137143
&cluster.MakeTestingClusterSettings().Version,
138144
)
145+
146+
// We're going to serve multiple node IDs with that one
147+
// context. Disable node ID checks.
148+
nodeContext.TestingAllowNamedRPCToAnonymousServer = true
139149
nodeDialer := nodedialer.New(nodeContext, nil)
140150

141151
// TODO(bdarnell): the retryable flag is no longer used for RPC errors.

pkg/rpc/context.go

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -459,8 +459,10 @@ func (ctx *Context) GetStatsMap() *syncmap.Map {
459459

460460
// GetLocalInternalClientForAddr returns the context's internal batch client
461461
// for target, if it exists.
462-
func (ctx *Context) GetLocalInternalClientForAddr(target string) roachpb.InternalClient {
463-
if target == ctx.AdvertiseAddr {
462+
func (ctx *Context) GetLocalInternalClientForAddr(
463+
target string, nodeID roachpb.NodeID,
464+
) roachpb.InternalClient {
465+
if target == ctx.AdvertiseAddr && nodeID == ctx.NodeID.Get() {
464466
return ctx.localInternalClient
465467
}
466468
return nil
@@ -695,21 +697,28 @@ func (ctx *Context) GRPCDialRaw(target string) (*grpc.ClientConn, <-chan struct{
695697
return conn, dialer.redialChan, err
696698
}
697699

698-
// GRPCDial calls grpc.Dial with options appropriate for the context.
699-
//
700-
// It does not require validation of the node ID between client and server:
701-
// if a connection existed already with some node ID requirement, that
702-
// requirement will remain; if no connection existed yet,
703-
// a new one is created without a node ID requirement.
704-
func (ctx *Context) GRPCDial(target string) *Connection {
705-
return ctx.GRPCDialNode(target, 0)
700+
// GRPCGossipDial uses GRPCDialNode and disables validation of the
701+
// node ID between client and server. This function should only be
702+
// used with the gossip client and CLI commands which can talk to any
703+
// node.
704+
func (ctx *Context) GRPCGossipDial(target string) *Connection {
705+
return ctx.grpcDialNodeInternal(target, 0)
706706
}
707707

708708
// GRPCDialNode calls grpc.Dial with options appropriate for the context.
709709
//
710-
// The remoteNodeID, if non-zero, becomes a constraint on the expected
711-
// node ID of the remote node; this is checked during heartbeats.
710+
// The remoteNodeID becomes a constraint on the expected node ID of
711+
// the remote node; this is checked during heartbeats. The caller is
712+
// responsible for ensuring the remote node ID is known prior to using
713+
// this function.
712714
func (ctx *Context) GRPCDialNode(target string, remoteNodeID roachpb.NodeID) *Connection {
715+
if remoteNodeID == 0 && !ctx.TestingAllowNamedRPCToAnonymousServer {
716+
log.Fatalf(context.TODO(), "invalid node ID 0 in GRPCDialNode()")
717+
}
718+
return ctx.grpcDialNodeInternal(target, remoteNodeID)
719+
}
720+
721+
func (ctx *Context) grpcDialNodeInternal(target string, remoteNodeID roachpb.NodeID) *Connection {
713722
thisConnKey := connKey{target, remoteNodeID}
714723
value, ok := ctx.conns.Load(thisConnKey)
715724
if !ok {
@@ -765,7 +774,7 @@ func (ctx *Context) NewBreaker(name string) *circuit.Breaker {
765774
// the first heartbeat.
766775
var ErrNotHeartbeated = errors.New("not yet heartbeated")
767776

768-
// ConnHealth returns nil if we have an open connection to the given
777+
// TestingConnHealth returns nil if we have an open connection to the given
769778
// target that succeeded on its most recent heartbeat. Otherwise, it
770779
// kicks off a connection attempt (unless one is already in progress
771780
// or we are in a backoff state) and returns an error (typically
@@ -776,13 +785,15 @@ var ErrNotHeartbeated = errors.New("not yet heartbeated")
776785
// "unhealthy" nodes.
777786
//
778787
// This is used in tests only; in clusters use (*Dialer).ConnHealth()
779-
// instead which validates the node ID.
780-
func (ctx *Context) ConnHealth(target string) error {
781-
if ctx.GetLocalInternalClientForAddr(target) != nil {
788+
// instead which automates the address resolution.
789+
//
790+
// TODO(knz): remove this altogether. Use the dialer in all cases.
791+
func (ctx *Context) TestingConnHealth(target string, nodeID roachpb.NodeID) error {
792+
if ctx.GetLocalInternalClientForAddr(target, nodeID) != nil {
782793
// The local server is always considered healthy.
783794
return nil
784795
}
785-
conn := ctx.GRPCDial(target)
796+
conn := ctx.GRPCDialNode(target, nodeID)
786797
return conn.Health()
787798
}
788799

0 commit comments

Comments
 (0)