Skip to content

Commit

Permalink
fix: ensure containerd-related directories removed on failed `bootstr…
Browse files Browse the repository at this point in the history
…ap/join-cluster`. (#863)

* fix: ensure containerd-related directories removed on failed `bootstrap/join-cluster`

`k8sd` automatically sets up some directories with the appropriate
ownership/permissions to be used by containerd in the early stages
of the `bootstrap` and `join-cluster` commands.

In the classic (non-strict) version of the k8s-snap, these
containerd directories are system-wide (e.g. `/etc/containerd`,
`/run/containerd`, etc).

Should any of the other setup steps fail after the containerd
directories were set up, the directories would still remain on
disk and thus lead to a 'partial installation' of on the host system.

This patch ensures that `k8s` will automatically remove any
containerd-related directories which were created in the event of
the  `bootstrap` / `join-cluster` commands failing.

Signed-off-by: Nashwan Azhari <nashwan.azhari@canonical.com>

* fix: ensure containerd Base Dir lockfile is never accidentally deleted.

The containerd Base Dir is the special path all other containerd-related
paths on the snap are derived from.

Under classic confinement and default settings, this path defaults
to the host's root (`/`), and thus extreme care must be taken to
not accidentally include it in k8sd's cleanup routine or the
k8s-snap's remove hook.

Signed-off-by: Nashwan Azhari <nashwan.azhari@canonical.com>

---------

Signed-off-by: Nashwan Azhari <nashwan.azhari@canonical.com>
  • Loading branch information
Nashwan Azhari authored Dec 16, 2024
1 parent f98694b commit eb495a0
Show file tree
Hide file tree
Showing 6 changed files with 224 additions and 21 deletions.
12 changes: 10 additions & 2 deletions k8s/lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,16 @@ k8s::remove::containerd() {
# this is to prevent removing containerd when it is not installed by the snap.
# NOTE: do NOT include .containerd-base-dir! By default, it will contain "/".
for file in "containerd-socket-path" "containerd-config-dir" "containerd-root-dir" "containerd-cni-bin-dir"; do
if [ -f "$SNAP_COMMON/lock/$file" ]; then
rm -rf $(cat "$SNAP_COMMON/lock/$file")
local lockpath="$SNAP_COMMON/lock/$file"
if [ -f "$lockpath" ]; then
local dirpath=$(cat "$SNAP_COMMON/lock/$file")

if [ $(readlink -m "$dirpath") = "/" ]; then
echo "WARN: lockfile '$lockpath' points to root ('/'). Skipping cleanup."
continue
fi

rm -rf "$dirpath"
fi
done
}
Expand Down
59 changes: 59 additions & 0 deletions src/k8s/pkg/k8sd/app/hooks_remove.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/canonical/k8s/pkg/k8sd/pki"
"github.com/canonical/k8s/pkg/k8sd/setup"
"github.com/canonical/k8s/pkg/log"
"github.com/canonical/k8s/pkg/snap"
snaputil "github.com/canonical/k8s/pkg/snap/util"
"github.com/canonical/k8s/pkg/utils/control"
"github.com/canonical/microcluster/v2/cluster"
Expand Down Expand Up @@ -145,5 +146,63 @@ func (a *App) onPreRemove(ctx context.Context, s state.State, force bool) (rerr
}
}

tryCleanupContainerdPaths(log, snap)

return nil
}

// tryCleanupContainerdPaths attempts to clean up all containerd directories which were
// created by the k8s-snap based on the existence of their respective lockfiles
// located in the directory returned by `s.LockFilesDir()`.
func tryCleanupContainerdPaths(log log.Logger, s snap.Snap) {
for lockpath, dirpath := range setup.ContainerdLockPathsForSnap(s) {
// Ensure lockfile exists:
if _, err := os.Stat(lockpath); os.IsNotExist(err) {
log.Info("WARN: failed to find containerd lockfile, no cleanup will be perfomed", "lockfile", lockpath, "directory", dirpath)
continue
}

// Ensure lockfile's contents is the one we expect:
lockfile_contents := ""
if contents, err := os.ReadFile(lockpath); err != nil {
log.Info("WARN: failed to read contents of lockfile", "lockfile", lockpath, "error", err)
continue
} else {
lockfile_contents = string(contents)
}

if lockfile_contents != dirpath {
log.Info("WARN: lockfile points to different path than expected", "lockfile", lockpath, "expected", dirpath, "actual", lockfile_contents)
continue
}

// Check directory exists before attempting to remove:
if _, err := os.Stat(dirpath); os.IsNotExist(err) {
log.Info("Containerd directory doesn't exist; skipping cleanup", "directory", dirpath)
} else {
// NOTE(aznashwan): because of the convoluted interfaces-based way the snap
// composes and creates the original lockfiles (see k8sd/setup/containerd.go)
// this check is meant to defend against accidental code/configuration errors which
// might lead to the root FS being deleted:
realPath, err := os.Readlink(dirpath)
if err != nil {
log.Error(err, fmt.Sprintf("Failed to os.Readlink the directory path for lockfile %q pointing to %q. Skipping cleanup", lockpath, dirpath))
continue
}

if realPath == "/" {
log.Error(fmt.Errorf("There is some configuration/logic error in the current versions of the k8s-snap related to lockfile %q (meant to lock %q, which points to %q) which could lead to accidental wiping of the root file system.", lockpath, dirpath, realPath), "Please report this issue upstream immediately.")
continue
}

if err := os.RemoveAll(dirpath); err != nil {
log.Info("WARN: failed to remove containerd data directory", "directory", dirpath, "error", err, "realPath", realPath)
continue // Avoid removing the lockfile path.
}
}

if err := os.Remove(lockpath); err != nil {
log.Info("WARN: Failed to remove containerd lockfile", "lockfile", lockpath)
}
}
}
37 changes: 31 additions & 6 deletions src/k8s/pkg/k8sd/setup/containerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,21 +166,46 @@ func Containerd(snap snap.Snap, extraContainerdConfig map[string]any, extraArgs
return nil
}

func saveSnapContainerdPaths(s snap.Snap) error {
// Write the containerd-related paths to files to properly clean-up on removal.
// ContainerdLockPathsForSnap returns a mapping between the absolute paths of
// the lockfiles within the k8s snap and the absolute paths of the containerd
// directory they lock.
//
// WARN: these lockfiles are meant to be used in later cleanup stages.
// DO NOT include any system paths which are not managed by the k8s-snap!
//
// It intentionally does NOT include the containerd base dir lockfile
// (which most of the rest of the paths are based on), as it is meant
// to indicate the root of the containerd install ('/' or '/var/snap/k8s/*').
func ContainerdLockPathsForSnap(s snap.Snap) map[string]string {
m := map[string]string{
"containerd-socket-path": s.ContainerdSocketDir(),
"containerd-config-dir": s.ContainerdConfigDir(),
"containerd-root-dir": s.ContainerdRootDir(),
"containerd-cni-bin-dir": s.CNIBinDir(),
snap.ContainerdBaseDir: s.GetContainerdBaseDir(),
}

for filename, content := range m {
if err := utils.WriteFile(filepath.Join(s.LockFilesDir(), filename), []byte(content), 0o600); err != nil {
return fmt.Errorf("failed to write %s: %w", filename, err)
prefixed := map[string]string{}
for k, v := range m {
prefixed[filepath.Join(s.LockFilesDir(), k)] = v
}

return prefixed
}

// saveSnapContainerdPaths creates the lock files for the containerd directory paths to be used for later cleanup.
func saveSnapContainerdPaths(s snap.Snap) error {
for lockpath, dirpath := range ContainerdLockPathsForSnap(s) {
if err := utils.WriteFile(lockpath, []byte(dirpath), 0o600); err != nil {
return fmt.Errorf("failed to write %s: %w", lockpath, err)
}
}

// Save the Containerd Base Dir separately:
baseDirPath := filepath.Join(s.LockFilesDir(), snap.ContainerdBaseDir)
if err := utils.WriteFile(baseDirPath, []byte(s.GetContainerdBaseDir()), 0o600); err != nil {
return fmt.Errorf("failed to write %s: %w", baseDirPath, err)
}

return nil
}

Expand Down
12 changes: 10 additions & 2 deletions src/k8s/pkg/k8sd/setup/containerd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ func TestContainerd(t *testing.T) {
ContainerdRegistryConfigDir: filepath.Join(dir, "containerd-hosts"),
ContainerdStateDir: filepath.Join(dir, "containerd-state"),
ContainerdExtraConfigDir: filepath.Join(dir, "containerd-confd"),
LockFilesDir: filepath.Join(dir, "lockfiles"),
ServiceArgumentsDir: filepath.Join(dir, "args"),
CNIBinDir: filepath.Join(dir, "opt-cni-bin"),
CNIConfDir: filepath.Join(dir, "cni-netd"),
Expand Down Expand Up @@ -129,9 +130,16 @@ func TestContainerd(t *testing.T) {
"containerd-root-dir": s.ContainerdRootDir(),
"containerd-cni-bin-dir": s.CNIBinDir(),
}
for filename, content := range m {

b, err := os.ReadFile(filepath.Join(s.LockFilesDir(), filename))
// Ensure locks directory exists:
files, err := os.ReadDir(s.LockFilesDir())
g.Expect(err).To(Not(HaveOccurred()))
t.Logf("Lockfiles currently in %q: %v", s.LockFilesDir(), files)

for filename, content := range m {
path := filepath.Join(s.LockFilesDir(), filename)
t.Logf("Checking path: %s", path)
b, err := os.ReadFile(path)
g.Expect(err).To(Not(HaveOccurred()))
g.Expect(string(b)).To(Equal(content))
}
Expand Down
75 changes: 64 additions & 11 deletions tests/integration/tests/test_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

LOG = logging.getLogger(__name__)

KUBE_CONTROLLER_MANAGER_SNAP_PORT = 10257

CONTAINERD_PATHS = [
"/etc/containerd",
"/run/containerd",
Expand All @@ -19,9 +21,22 @@
CNI_PATH = "/opt/cni/bin"


def _assert_paths_not_exist(instance: harness.Instance, paths: List[str]):
paths_which_exist = [
p
for p, exists in util.check_file_paths_exist(instance, paths).items()
if exists
]
if paths_which_exist:
raise AssertionError(
f"Expected the following path(s) to not exist: {paths_which_exist}"
)


@pytest.mark.node_count(1)
@pytest.mark.tags(tags.NIGHTLY)
def test_node_cleanup(instances: List[harness.Instance], tmp_path):
"""Verifies that a `snap remove k8s` will perform proper cleanup."""
instance = instances[0]
util.wait_for_dns(instance)
util.wait_for_network(instance)
Expand All @@ -30,11 +45,7 @@ def test_node_cleanup(instances: List[harness.Instance], tmp_path):

# Check that the containerd-related folders are removed on snap removal.
all_paths = CONTAINERD_PATHS + [CNI_PATH]
process = instance.exec(
["ls", *all_paths], capture_output=True, text=True, check=False
)
for path in all_paths:
assert f"cannot access '{path}': No such file or directory" in process.stderr
_assert_paths_not_exist(instance, all_paths)

util.setup_k8s_snap(instance, tmp_path)
instance.exec(["k8s", "bootstrap"])
Expand Down Expand Up @@ -88,18 +99,60 @@ def test_node_cleanup_new_containerd_path(instances: List[harness.Instance]):
assert (
f"cannot access '{path}': No such file or directory" in process.stderr
)
_assert_paths_not_exist(instance, exp_missing_paths)

# Check that the containerd-related folders are in the new locations.
# If one of them is missing, this should have a non-zero exit code.
instance.exec(["ls", *new_containerd_paths], check=True)

for instance in instances:
# Check that the containerd-related folders are not in the new locations after snap removal.
util.remove_k8s_snap(instance)
process = instance.exec(
["ls", *new_containerd_paths], capture_output=True, text=True, check=False
# Check that the containerd-related folders are not in the new locations after snap removal.
_assert_paths_not_exist(instance, new_containerd_paths)


@pytest.mark.node_count(1)
@pytest.mark.no_setup()
@pytest.mark.tags(tags.NIGHTLY)
def test_containerd_path_cleanup_on_failed_init(
instances: List[harness.Instance], tmp_path
):
"""Tests that a failed `bootstrap` properly cleans up any
containerd-related paths it may have created as part of the
failed `bootstrap`.
It induces a bootstrap failure by pre-binding a required k8s service
port (10257 for the kube-controller-manager) before running `k8s bootstrap`.
NOTE: a failed `join-cluster` will trigger the exact same cleanup
hook, so the test implicitly applies to it as well.
"""
instance = instances[0]
expected_code = 1
expected_message = (
"Encountered error(s) while verifying port availability for Kubernetes "
"services: Port 10257 (needed by: kube-controller-manager) is already in use."
)

with util.open_port(KUBE_CONTROLLER_MANAGER_SNAP_PORT) as _:
util.setup_k8s_snap(instance, tmp_path, config.SNAP, connect_interfaces=False)

proc = instance.exec(
["k8s", "bootstrap"], capture_output=True, text=True, check=False
)
for path in new_containerd_paths:
assert (
f"cannot access '{path}': No such file or directory" in process.stderr

if proc.returncode != expected_code:
raise AssertionError(
f"Expected `k8s bootstrap` to exit with code {expected_code}, "
f"but it exited with {proc.returncode}.\n"
f"Stdout was: \n{proc.stdout}.\nStderr was: \n{proc.stderr}"
)

if expected_message not in proc.stderr:
raise AssertionError(
f"Expected to find port-related warning '{expected_message}' in "
"stderr of the `k8s bootstrap` command.\n"
f"Stdout was: \n{proc.stdout}.\nStderr was: \n{proc.stderr}"
)

_assert_paths_not_exist(instance, CONTAINERD_PATHS)
50 changes: 50 additions & 0 deletions tests/integration/tests/test_util/util.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
#
# Copyright 2024 Canonical, Ltd.
#
import contextlib
import ipaddress
import json
import logging
import re
import shlex
import socket
import subprocess
import urllib.request
from datetime import datetime
Expand Down Expand Up @@ -514,3 +516,51 @@ def find_suitable_cidr(parent_cidr: str, excluded_ips: List[str]):

return str(lb_net)
raise RuntimeError("Could not find a suitable CIDR for LoadBalancer services")


@contextlib.contextmanager
def open_port(
port: int,
host: str = "",
address_family: socket.AddressFamily = socket.AF_INET,
socket_kind: socket.SocketKind = socket.SOCK_STREAM,
max_backlogged_connections: int = 0,
):
"""Context manager which opens a socket with the given properties
and binds it to the given port.
Yields the already setup and listening socket object for use.
By default, it will only allow one single active connection
and instantly refuse any new ones. Use the `max_backlogged_connections`
argument if you'd like it to accept more connections as `pending`.
"""
sock = socket.socket(family=address_family, type=socket_kind)
if not host:
host = socket.gethostname()
sock.bind((host, port))
LOG.info(f"Successfully bound new socket on '{host}:{port}'")

try:
sock.listen(max_backlogged_connections)
LOG.info(f"Successfully started listening on '{host}:{port}'")
yield sock
finally:
sock.close()
LOG.info(f"Closed socket on '{host}:{port}'")


def check_file_paths_exist(
instance: harness.Instance, paths: List[str]
) -> Mapping[str, bool]:
"""Returns whether the given path(s) exist within the given harness instance
by checking the output of a single `ls` command containing all of them.
It is recommended to always use absolute paths, as the cwd relative to
which the `ls` will get executed depends on the harness instance.
"""
process = instance.exec(["ls", *paths], capture_output=True, text=True, check=False)
return {
p: not (f"cannot access '{p}': No such file or directory" in process.stderr)
for p in paths
}

0 comments on commit eb495a0

Please sign in to comment.