Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion cmd/gpud/command/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,19 @@ sudo rm /etc/systemd/system/gpud.service
Flags: []cli.Flag{
&cli.StringFlag{
Name: "endpoint",
Usage: "endpoint for control plane",
Usage: "(optional) endpoint for control plane",
Hidden: true,
},
&cli.StringFlag{
Name: "machine-id",
Hidden: true,
Usage: "(optional) for override default machine id",
},
&cli.StringFlag{
Name: "token",
Hidden: true,
Usage: "(optional) token for control plane",
},
&cli.StringFlag{
Name: "log-level,l",
Usage: "set the logging level [debug, info, warn, error, fatal, panic, dpanic]",
Expand Down
46 changes: 46 additions & 0 deletions cmd/gpud/run/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,11 @@ import (
"github.com/leptonai/gpud/pkg/config"
gpudmanager "github.com/leptonai/gpud/pkg/gpud-manager"
"github.com/leptonai/gpud/pkg/log"
pkgmetadata "github.com/leptonai/gpud/pkg/metadata"
pkgnfschecker "github.com/leptonai/gpud/pkg/nfs-checker"
"github.com/leptonai/gpud/pkg/nvidia-query/infiniband"
gpudserver "github.com/leptonai/gpud/pkg/server"
pkgsqlite "github.com/leptonai/gpud/pkg/sqlite"
pkgsystemd "github.com/leptonai/gpud/pkg/systemd"
"github.com/leptonai/gpud/version"
)
Expand Down Expand Up @@ -65,6 +67,11 @@ func Command(cliContext *cli.Context) error {
ibClassRootDir := cliContext.String("infiniband-class-root-dir")
components := cliContext.String("components")

// Optional overrides for control plane connectivity
endpoint := cliContext.String("endpoint")
overrideMachineID := cliContext.String("machine-id")
token := cliContext.String("token")

gpuCount := cliContext.Int("gpu-count")
infinibandExpectedPortStates := cliContext.String("infiniband-expected-port-states")
nfsCheckerConfigs := cliContext.String("nfs-checker-configs")
Expand Down Expand Up @@ -174,6 +181,45 @@ func Command(cliContext *cli.Context) error {
return err
}

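// Persist any non-empty endpoint / machine-id / token flag values into the
// metadata table of the state database (cfg.State) for subsequent sessions.
// NOTE(review): the defers inside this block are function-scoped, so the
// timeout context and the DB handle are released only when Command returns,
// not at the end of this block — confirm that lifetime is intended.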
if endpoint != "" || overrideMachineID != "" || token != "" {
mctx, mcancel := context.WithTimeout(context.Background(), 10*time.Second)
defer mcancel()

dbRW, err := pkgsqlite.Open(cfg.State)
if err != nil {
return fmt.Errorf("failed to open state for metadata overrides: %w", err)
}
defer func() {
_ = dbRW.Close()
}()

if err := pkgmetadata.CreateTableMetadata(mctx, dbRW); err != nil {
return fmt.Errorf("failed to ensure metadata table: %w", err)
}

if endpoint != "" {
if err := pkgmetadata.SetMetadata(mctx, dbRW, pkgmetadata.MetadataKeyEndpoint, endpoint); err != nil {
return fmt.Errorf("failed to set endpoint metadata: %w", err)
}
log.Logger.Infow("overriding endpoint from flag", "endpoint", endpoint)
}

if overrideMachineID != "" {
if err := pkgmetadata.SetMetadata(mctx, dbRW, pkgmetadata.MetadataKeyMachineID, overrideMachineID); err != nil {
return fmt.Errorf("failed to set machine-id metadata: %w", err)
}
log.Logger.Infow("overriding machine id from flag", "machineID", overrideMachineID)
}

if token != "" {
if err := pkgmetadata.SetMetadata(mctx, dbRW, pkgmetadata.MetadataKeyToken, token); err != nil {
return fmt.Errorf("failed to set token metadata: %w", err)
}
log.Logger.Infow("overriding token from flag")
}
}

rootCtx, rootCancel := context.WithCancel(context.Background())
defer rootCancel()

Expand Down
Loading