Skip to content

Commit

Permalink
Check new controllers against etcd member-list to detect replaced hos…
Browse files Browse the repository at this point in the history
…ts (#714)

* Check new controllers against etcd member-list to detect replaced hosts

Signed-off-by: Kimmo Lehto <[email protected]>

* Add a smoke-test for controller swap

Signed-off-by: Kimmo Lehto <[email protected]>

* Don't actually run etcd leave on dry-run

Signed-off-by: Kimmo Lehto <[email protected]>

---------

Signed-off-by: Kimmo Lehto <[email protected]>
  • Loading branch information
kke authored Jun 10, 2024
1 parent 35777a2 commit 68b97cd
Show file tree
Hide file tree
Showing 11 changed files with 294 additions and 9 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/smoke.yml
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,34 @@ jobs:
env:
LINUX_IMAGE: ${{ matrix.image }}
run: make smoke-backup-restore

# Controller swap smoke test job: runs `make smoke-controller-swap` once per
# image listed in the matrix, after the `build` job has finished.
smoke-controller-swap:
strategy:
matrix:
image:
- quay.io/k0sproject/bootloose-alpine3.18

name: Controller swap
needs: build
runs-on: ubuntu-20.04

steps:
- uses: actions/checkout@v4
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version-file: go.mod
check-latest: true

# Fetch the "k0sctl" artifact, restore the k0s and kubectl caches, then make
# the binaries executable (the chmod failures are tolerated via `|| true`).
- {"name":"Compiled binary cache","uses":"actions/download-artifact@v4","with":{"name":"k0sctl","path":"."}}
- {"name":"K0sctl cache","uses":"actions/cache@v3","with":{"path":"/var/cache/k0sctl/k0s\n~/.cache/k0sctl/k0s\n","key":"k0sctl-cache"}}
- {"name":"Kubectl cache","uses":"actions/cache@v3","with":{"path":"smoke-test/kubectl\n","key":"kubectl-${{ hashFiles('smoke-test/smoke.common.sh') }}","restore-keys":"kubectl-"}}
- {"name":"Make binaries executable","run":"chmod +x k0sctl || true\nchmod +x smoke-test/kubectl || true"}

# The matrix image is handed to the smoke test through LINUX_IMAGE.
- name: Run smoke tests
env:
LINUX_IMAGE: ${{ matrix.image }}
run: make smoke-controller-swap

smoke-init:
name: Init sub-command smoke test
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ build-all: $(addprefix bin/,$(bins)) bin/checksums.md
clean:
rm -rf bin/ k0sctl

smoketests := smoke-basic smoke-basic-rootless smoke-files smoke-upgrade smoke-reset smoke-os-override smoke-init smoke-backup-restore smoke-dynamic smoke-basic-openssh smoke-dryrun smoke-downloadurl
smoketests := smoke-basic smoke-basic-rootless smoke-files smoke-upgrade smoke-reset smoke-os-override smoke-init smoke-backup-restore smoke-dynamic smoke-basic-openssh smoke-dryrun smoke-downloadurl smoke-controller-swap
.PHONY: $(smoketests)
$(smoketests): k0sctl
$(MAKE) -C smoke-test $@
Expand Down
4 changes: 3 additions & 1 deletion phase/configure_k0s.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,9 @@ func (p *ConfigureK0s) generateDefaultConfig() (string, error) {

// Run the phase
func (p *ConfigureK0s) Run() error {
controllers := p.Config.Spec.Hosts.Controllers()
controllers := p.Config.Spec.Hosts.Controllers().Filter(func(h *cluster.Host) bool {
return !h.Reset && len(h.Metadata.K0sNewConfig) > 0
})
return p.parallelDo(controllers, p.configureK0s)
}

Expand Down
95 changes: 95 additions & 0 deletions phase/gather_k0s_facts.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import (
"context"
"encoding/json"
"fmt"
"net"
"net/url"
"path"
"strings"

Expand Down Expand Up @@ -78,6 +80,10 @@ func (p *GatherK0sFacts) Run() error {
p.SetProp("clusterID", id)
}

if err := p.investigateEtcd(); err != nil {
return err
}

var workers cluster.Hosts = p.hosts.Workers()
if err := p.parallelDo(workers, p.investigateK0s); err != nil {
return err
Expand All @@ -86,6 +92,95 @@ func (p *GatherK0sFacts) Run() error {
return nil
}

// isInternalEtcd reports whether the cluster is expected to store its state in
// an etcd that is not configured as an external cluster.
//
// It returns false when the leader is not a "controller" or
// "controller+worker", when the leader has no running k0s version, when the
// configured storage type is something other than etcd, or when the etcd
// storage section contains an externalCluster mapping. A missing k0s config,
// a missing storage section or an empty storage type are all treated as the
// default, which is assumed to be internal etcd.
func (p *GatherK0sFacts) isInternalEtcd() bool {
	leader := p.leader

	switch leader.Role {
	case "controller", "controller+worker":
		// Only these roles are inspected any further.
	default:
		return false
	}

	if leader.Metadata.K0sRunningVersion == nil {
		return false
	}

	k0sSpec := p.Config.Spec.K0s
	if k0sSpec == nil || k0sSpec.Config == nil {
		log.Debugf("%s: k0s config not found, expecting default internal etcd", leader)
		return true
	}

	log.Debugf("%s: checking storage config for etcd", leader)
	storage, found := k0sSpec.Config.Dig("spec", "storage").(dig.Mapping)
	if !found {
		log.Debugf("%s: storage config not found, expecting default internal etcd", leader)
		return true
	}

	storageType := storage.DigString("type")
	if storageType == "" {
		log.Debugf("%s: storage type is default", leader)
		return true
	}
	if storageType != "etcd" {
		log.Debugf("%s: storage type is %s", leader, storageType)
		return false
	}

	// Storage type is explicitly etcd: it only counts as internal when no
	// external cluster has been configured for it.
	if _, external := storage.Dig("etcd", "externalCluster").(dig.Mapping); external {
		log.Debugf("%s: storage is configured with external etcd", leader)
		return false
	}
	log.Debugf("%s: storage type is etcd", leader)
	return true
}

// investigateEtcd lists the etcd members through the leader when
// isInternalEtcd reports true; otherwise it logs that the listing is skipped.
// Any error from listing the members is returned unchanged.
func (p *GatherK0sFacts) investigateEtcd() error {
	if p.isInternalEtcd() {
		return p.listEtcdMembers(p.leader)
	}

	log.Debugf("%s: skipping etcd member list", p.leader)
	return nil
}

// listEtcdMembers runs "k0s etcd member-list" on host h, extracts the host
// part of every member peer URL and stores the resulting list in
// p.Config.Metadata.EtcdMembers.
//
// An error is returned when the command fails, when its output is not valid
// JSON, or when a member URL cannot be parsed or split into host and port.
// Member entries whose value is not a string are ignored.
func (p *GatherK0sFacts) listEtcdMembers(h *cluster.Host) error {
	log.Infof("%s: listing etcd members", h)

	// The command prints JSON such as:
	// {"members":{"controller0":"https://172.17.0.2:2380","controller1":"https://172.17.0.3:2380"}}
	// On versions like ~1.21.x the output goes to stderr with extra fields
	// (from logrus), which is why stderr is merged into stdout here.
	memberListCmd := h.Configurer.K0sCmdf("etcd member-list --data-dir=%s 2>&1", h.K0sDataDir())
	raw, err := h.ExecOutput(memberListCmd, exec.Sudo(h))
	if err != nil {
		return fmt.Errorf("failed to run list etcd members command: %w", err)
	}

	var decoded map[string]any
	if err := json.Unmarshal([]byte(raw), &decoded); err != nil {
		return fmt.Errorf("failed to decode etcd member-list output: %w", err)
	}

	// A missing or non-object "members" field yields a nil map, which simply
	// results in an empty member list.
	members, _ := decoded["members"].(map[string]any)
	addresses := make([]string, 0, len(members))

	for _, value := range members {
		peerURL, isString := value.(string)
		if !isString {
			continue
		}

		parsed, err := url.Parse(peerURL)
		if err != nil {
			return fmt.Errorf("failed to parse etcd member URL: %w", err)
		}

		address, _, err := net.SplitHostPort(parsed.Host)
		if err != nil {
			return fmt.Errorf("failed to split etcd member URL: %w", err)
		}

		log.Debugf("%s: detected etcd member %s", h, address)
		addresses = append(addresses, address)
	}

	p.Config.Metadata.EtcdMembers = addresses
	return nil
}

func (p *GatherK0sFacts) investigateK0s(h *cluster.Host) error {
output, err := h.ExecOutput(h.Configurer.K0sCmdf("version"), exec.Sudo(h))
if err != nil {
Expand Down
6 changes: 1 addition & 5 deletions phase/reset_controllers.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,7 @@ func (p *ResetControllers) Run() error {
if !p.NoLeave {
log.Debugf("%s: leaving etcd...", h)

etcdAddress := h.SSH.Address
if h.PrivateAddress != "" {
etcdAddress = h.PrivateAddress
}
if err := h.Exec(h.Configurer.K0sCmdf("etcd leave --peer-address %s --datadir %s", etcdAddress, h.K0sDataDir()), exec.Sudo(h)); err != nil {
if err := h.Exec(h.Configurer.K0sCmdf("etcd leave --peer-address %s --datadir %s", h.PrivateAddress, h.K0sDataDir()), exec.Sudo(h)); err != nil {
log.Warnf("%s: failed to leave etcd: %s", h, err.Error())
}
log.Debugf("%s: leaving etcd completed", h)
Expand Down
49 changes: 49 additions & 0 deletions phase/validate_facts.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package phase

import (
"fmt"
"slices"

log "github.com/sirupsen/logrus"
)
Expand All @@ -27,6 +28,10 @@ func (p *ValidateFacts) Run() error {
return err
}

if err := p.validateControllerSwap(); err != nil {
return err
}

return nil
}

Expand Down Expand Up @@ -69,3 +74,47 @@ func (p *ValidateFacts) validateDefaultVersion() error {

return nil
}

// validateControllerSwap compares the controllers listed in the k0sctl
// configuration with the etcd member list stored in
// p.Config.Metadata.EtcdMembers, to catch a controller that appears to have
// been replaced: k0s is not running on it, yet its private address is still
// registered as an etcd member.
//
// The check is skipped when the leader has no running k0s version or when the
// leader's role is "single". When a replaced controller is found, an error is
// returned unless Force is set; with Force, "k0s etcd leave" is executed on
// the leader for that address via p.Wet, and an error is returned only if
// that command fails.
func (p *ValidateFacts) validateControllerSwap() error {
	log.Debugf("validating controller list vs etcd member list")

	leader := p.Config.Spec.K0sLeader()
	if leader.Metadata.K0sRunningVersion == nil {
		log.Debugf("%s: leader has no k0s running, assuming a fresh cluster", leader)
		return nil
	}

	if leader.Role == "single" {
		log.Debugf("%s: leader is a single node, assuming no etcd", leader)
		return nil
	}

	controllers := p.Config.Spec.Hosts.Controllers()
	etcdMembers := p.Config.Metadata.EtcdMembers

	if len(etcdMembers) > len(controllers) {
		log.Warnf("there are more etcd members in the cluster than controllers listed in the k0sctl configuration")
	}

	for _, h := range controllers {
		if h.Metadata.K0sRunningVersion != nil {
			log.Debugf("%s: host has k0s running, no need to check if it was replaced", h)
			continue
		}

		log.Debugf("%s: host is new, checking if etcd members list already contains %s", h, h.PrivateAddress)
		if !slices.Contains(etcdMembers, h.PrivateAddress) {
			log.Debugf("%s: no match, assuming its safe to install", h)
			continue
		}

		if !Force {
			// The suggested command is wrapped in a balanced pair of backticks
			// so that it can be copied from the message as-is.
			return fmt.Errorf("controller %s is listed as an existing etcd member but k0s is not found installed on it, the host may have been replaced. check the host and use `k0s etcd leave --peer-address %s` on a controller or re-run apply with --force", h, h.PrivateAddress)
		}

		log.Infof("%s: force used, running 'k0s etcd leave' for the host", h)
		leaveCommand := leader.Configurer.K0sCmdf("etcd leave --peer-address %s", h.PrivateAddress)
		// NOTE(review): this Exec passes no sudo option and no data-dir flag,
		// unlike the etcd member-list call in GatherK0sFacts — confirm this
		// works for non-root connections and custom data directories.
		err := p.Wet(h, fmt.Sprintf("remove host from etcd using %v", leaveCommand), func() error {
			return leader.Exec(leaveCommand)
		})
		if err != nil {
			return fmt.Errorf("controller %s is listed as an existing etcd member but k0s is not found installed on it, the host may have been replaced. attempted etcd leave for the address %s but it failed: %w", h, h.PrivateAddress, err)
		}
	}

	return nil
}
5 changes: 3 additions & 2 deletions pkg/apis/k0sctl.k0sproject.io/v1beta1/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ const APIVersion = "k0sctl.k0sproject.io/v1beta1"

// ClusterMetadata defines cluster metadata
type ClusterMetadata struct {
Name string `yaml:"name" validate:"required" default:"k0s-cluster"`
Kubeconfig string `yaml:"-"`
Name string `yaml:"name" validate:"required" default:"k0s-cluster"`
Kubeconfig string `yaml:"-"`
EtcdMembers []string `yaml:"-"`
}

// Cluster describes launchpad.yaml configuration
Expand Down
3 changes: 3 additions & 0 deletions smoke-test/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -55,5 +55,8 @@ smoke-downloadurl: $(bootloose) id_rsa_k0s k0sctl
smoke-backup-restore: $(bootloose) id_rsa_k0s k0sctl
./smoke-backup-restore.sh

smoke-controller-swap: $(bootloose) id_rsa_k0s k0sctl
BOOTLOOSE_TEMPLATE=bootloose-controller-swap.yaml.tpl K0SCTL_CONFIG=k0sctl-controller-swap.yaml ./smoke-controller-swap.sh

%.iid: Dockerfile.%
docker build --iidfile '$@' - < '$<'
23 changes: 23 additions & 0 deletions smoke-test/bootloose-controller-swap.yaml.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# bootloose cluster template for the controller swap smoke test.
# $LINUX_IMAGE is a placeholder for the machine image to boot.
cluster:
name: k0s
privateKey: ./id_rsa_k0s
machines:
# Three docker-backed machines named after the manager%d pattern.
- count: 3
backend: docker
spec:
image: $LINUX_IMAGE
name: manager%d
privileged: true
volumes:
- type: bind
source: /lib/modules
destination: /lib/modules
- type: volume
destination: /var/lib/k0s
# NOTE(review): host ports are presumably incremented per machine (the k0sctl
# config for this test connects to 9022, 9023 and 9024) — confirm.
portMappings:
- containerPort: 22
hostPort: 9022
- containerPort: 443
hostPort: 443
- containerPort: 6443
hostPort: 6443
29 changes: 29 additions & 0 deletions smoke-test/k0sctl-controller-swap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# k0sctl configuration for the controller swap smoke test: three controllers
# reached over SSH on 127.0.0.1, ports 9022 to 9024, all using the same key.
apiVersion: k0sctl.k0sproject.io/v1beta1
kind: cluster
spec:
hosts:
- role: controller
uploadBinary: true
ssh:
address: "127.0.0.1"
port: 9022
keyPath: ./id_rsa_k0s
- role: controller
uploadBinary: true
ssh:
address: "127.0.0.1"
port: 9023
keyPath: ./id_rsa_k0s
- role: controller
uploadBinary: true
ssh:
address: "127.0.0.1"
port: 9024
keyPath: ./id_rsa_k0s
k0s:
# NOTE(review): ${K0S_VERSION} is presumably substituted from the environment
# before the file is used — confirm against the smoke test helpers.
version: "${K0S_VERSION}"
config:
spec:
telemetry:
enabled: false

59 changes: 59 additions & 0 deletions smoke-test/smoke-controller-swap.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/usr/bin/env sh
#
# Smoke test for replaced-controller detection.
#
# Flow: apply a three-controller cluster, destroy and recreate the container
# of manager2 so that it comes back with the same IP but without k0s, then
# check that re-applying fails until the stale etcd member has been removed
# with "k0s etcd leave".
#
# cleanup, deleteCluster and createCluster are not defined here; they are
# presumably provided by smoke.common.sh, which is sourced below.

K0SCTL_CONFIG=${K0SCTL_CONFIG:-"k0sctl-controller-swap.yaml"}

set -ex

. ./smoke.common.sh
trap cleanup EXIT

deleteCluster
createCluster

echo "* Starting apply"
../k0sctl apply --config "${K0SCTL_CONFIG}" --debug
echo "* Apply OK"

echo "* Get the ip of the last controller"
controllerip=$(bootloose show manager2 -o json | grep '"ip"' | head -1 | cut -d'"' -f4)
# The pipeline's exit status is that of cut, so a failed extraction would go
# unnoticed by "set -e" and leave the variable empty. Without this guard the
# IP comparison further down would pass with two empty strings and the etcd
# leave call would receive an empty peer address.
if [ -z "$controllerip" ]; then
  echo "Could not determine the IP of manager2"
  exit 1
fi

echo "* Wipe controller 3"
docker rm -fv "$(bootloose show manager2 -o json | grep '"container"' | head -1 | cut -d'"' -f4)"

echo "* Verify its gone"
bootloose show manager2 | grep "Not created"

echo "* Recreate controller2"
createCluster

echo "* Verify its back and IP is the same"
bootloose show manager2 | grep "Running"
newip=$(bootloose show manager2 -o json | grep '"ip"' | head -1 | cut -d'"' -f4)
if [ "$controllerip" != "$newip" ]; then
  echo "IP mismatch: $controllerip != $newip - ip should get reused"
  exit 1
fi

echo "* Re-apply should fail because of known hosts"
if ../k0sctl apply --config "${K0SCTL_CONFIG}" --debug; then
  echo "Re-apply should have failed because of known hosts"
  exit 1
fi

echo "* Clear known hosts"
truncate -s 0 ~/.ssh/known_hosts

echo "* Re-apply should fail because of replaced controller"
if ../k0sctl apply --config "${K0SCTL_CONFIG}" --debug; then
  echo "Re-apply should have failed because of replaced controller"
  exit 1
fi

echo "* Perform etcd member removal"
bootloose ssh root@manager0 -- k0s etcd leave --peer-address "$controllerip"

echo "* Re-apply should succeed"
../k0sctl apply --config "${K0SCTL_CONFIG}" --debug

echo "* Done"

0 comments on commit 68b97cd

Please sign in to comment.