Skip to content

Commit

Permalink
Fix containerd-related path cleanup on failed bootstrap/join and asso…
Browse files Browse the repository at this point in the history
…ciated test. (#910)

---------

Signed-off-by: Nashwan Azhari <nashwan.azhari@canonical.com>
Co-authored-by: Berkay Tekin Oz <ozberkaytekin@gmail.com>
  • Loading branch information
Nashwan Azhari and berkayoz authored Dec 20, 2024
1 parent 71a4869 commit a278270
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 78 deletions.
22 changes: 13 additions & 9 deletions src/k8s/pkg/k8sd/app/hooks_remove.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"database/sql"
"errors"
"fmt"
"io/fs"
"net"
"os"

Expand Down Expand Up @@ -177,17 +178,20 @@ func tryCleanupContainerdPaths(log log.Logger, s snap.Snap) {
}

// Check directory exists before attempting to remove:
if _, err := os.Stat(dirpath); os.IsNotExist(err) {
if stat, err := os.Stat(dirpath); os.IsNotExist(err) {
log.Info("Containerd directory doesn't exist; skipping cleanup", "directory", dirpath)
} else {
// NOTE(aznashwan): because of the convoluted interfaces-based way the snap
// composes and creates the original lockfiles (see k8sd/setup/containerd.go)
// this check is meant to defend against accidental code/configuration errors which
// might lead to the root FS being deleted:
realPath, err := os.Readlink(dirpath)
if err != nil {
log.Error(err, fmt.Sprintf("Failed to os.Readlink the directory path for lockfile %q pointing to %q. Skipping cleanup", lockpath, dirpath))
continue
realPath := dirpath
if stat.Mode()&fs.ModeSymlink != 0 {
// NOTE(aznashwan): because of the convoluted interfaces-based way the snap
// composes and creates the original lockfiles (see k8sd/setup/containerd.go)
// this check is meant to defend against accidental code/configuration errors which
// might lead to the root FS being deleted:
realPath, err = os.Readlink(dirpath)
if err != nil {
log.Error(err, fmt.Sprintf("Failed to os.Readlink the directory path for lockfile %q pointing to %q. Skipping cleanup", lockpath, dirpath))
continue
}
}

if realPath == "/" {
Expand Down
15 changes: 15 additions & 0 deletions src/k8s/pkg/k8sd/setup/containerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,21 @@ func defaultContainerdConfig(
// Containerd configures configuration and arguments for containerd on the local node.
// Optionally, a number of registry mirrors and auths can be configured.
func Containerd(snap snap.Snap, extraContainerdConfig map[string]any, extraArgs map[string]*string) error {
// We create the directories here since PreInitCheck is called before this
// This ensures we only create the directories if we are going to configure containerd
for _, dir := range []string{
snap.ContainerdConfigDir(),
snap.ContainerdExtraConfigDir(),
snap.ContainerdRegistryConfigDir(),
} {
if dir == "" {
continue
}
if err := os.MkdirAll(dir, 0o700); err != nil {
return fmt.Errorf("failed to create required directory: %w", err)
}
}

configToml := defaultContainerdConfig(
snap.CNIConfDir(),
snap.CNIBinDir(),
Expand Down
3 changes: 0 additions & 3 deletions src/k8s/pkg/k8sd/setup/directories.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@ func EnsureAllDirectories(snap snap.Snap) error {

for _, dir := range []string{
snap.CNIConfDir(),
snap.ContainerdConfigDir(),
snap.ContainerdExtraConfigDir(),
snap.ContainerdRegistryConfigDir(),
snap.K8sDqliteStateDir(),
snap.KubernetesConfigDir(),
snap.KubernetesPKIDir(),
Expand Down
18 changes: 18 additions & 0 deletions tests/integration/templates/bootstrap-fail.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Contains the bootstrap configuration for the smoke test.
cluster-config:
network:
enabled: true
dns:
enabled: true
ingress:
enabled: true
load-balancer:
enabled: true
local-storage:
enabled: true
gateway:
enabled: true
metrics-server:
enabled: true
extra-node-kube-apiserver-args:
--foo: bar
52 changes: 20 additions & 32 deletions tests/integration/tests/test_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@

LOG = logging.getLogger(__name__)

KUBE_CONTROLLER_MANAGER_SNAP_PORT = 10257
CONTAINERD_SOCKET_DIRECTORY_CLASSIC = "/run/containerd"

CONTAINERD_PATHS = [
"/etc/containerd",
"/run/containerd",
CONTAINERD_SOCKET_DIRECTORY_CLASSIC,
"/var/lib/containerd",
]
CNI_PATH = "/opt/cni/bin"
Expand Down Expand Up @@ -112,48 +112,36 @@ def test_node_cleanup_new_containerd_path(instances: List[harness.Instance]):


@pytest.mark.node_count(1)
@pytest.mark.no_setup()
@pytest.mark.disable_k8s_bootstrapping()
@pytest.mark.tags(tags.NIGHTLY)
@pytest.mark.skip(reason="the test fails when using a harness other than 'local'")
def test_containerd_path_cleanup_on_failed_init(
instances: List[harness.Instance], tmp_path
):
def test_containerd_path_cleanup_on_failed_init(instances: List[harness.Instance]):
"""Tests that a failed `bootstrap` properly cleans up any
containerd-related paths it may have created as part of the
failed `bootstrap`.
It induces a bootstrap failure by pre-binding a required k8s service
port (10257 for the kube-controller-manager) before running `k8s bootstrap`.
It introduces a bootstrap failure by supplying an incorrect argument to the kube-apiserver.
The bootstrap/join-cluster aborting behavior was added in this PR:
https://github.com/canonical/k8s-snap/pull/772
NOTE: a failed `join-cluster` will trigger the exact same cleanup
hook, so the test implicitly applies to it as well.
"""
instance = instances[0]
expected_code = 1
expected_message = (
"Encountered error(s) while verifying port availability for Kubernetes "
"services: Port 10257 (needed by: kube-controller-manager) is already in use."
)

with util.open_port(KUBE_CONTROLLER_MANAGER_SNAP_PORT) as _:
util.setup_k8s_snap(instance, tmp_path, config.SNAP, connect_interfaces=False)
fail_bootstrap_config = (config.MANIFESTS_DIR / "bootstrap-fail.yaml").read_text()

proc = instance.exec(
["k8s", "bootstrap"], capture_output=True, text=True, check=False
)

if proc.returncode != expected_code:
raise AssertionError(
f"Expected `k8s bootstrap` to exit with code {expected_code}, "
f"but it exited with {proc.returncode}.\n"
f"Stdout was: \n{proc.stdout}.\nStderr was: \n{proc.stderr}"
)
proc = instance.exec(
["k8s", "bootstrap", "--file", "-"],
input=str.encode(fail_bootstrap_config),
check=False,
)

if expected_message not in proc.stderr:
raise AssertionError(
f"Expected to find port-related warning '{expected_message}' in "
"stderr of the `k8s bootstrap` command.\n"
f"Stdout was: \n{proc.stdout}.\nStderr was: \n{proc.stderr}"
)
if proc.returncode != expected_code:
raise AssertionError(
f"Expected `k8s bootstrap` to exit with code {expected_code}, "
f"but it exited with {proc.returncode}.\n"
)

_assert_paths_not_exist(instance, CONTAINERD_PATHS)
_assert_paths_not_exist(instance, CONTAINERD_PATHS)
34 changes: 0 additions & 34 deletions tests/integration/tests/test_util/util.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
#
# Copyright 2024 Canonical, Ltd.
#
import contextlib
import ipaddress
import json
import logging
import re
import shlex
import socket
import subprocess
import urllib.request
from datetime import datetime
Expand Down Expand Up @@ -542,38 +540,6 @@ def find_suitable_cidr(parent_cidr: str, excluded_ips: List[str]):
raise RuntimeError("Could not find a suitable CIDR for LoadBalancer services")


@contextlib.contextmanager
def open_port(
port: int,
host: str = "",
address_family: socket.AddressFamily = socket.AF_INET,
socket_kind: socket.SocketKind = socket.SOCK_STREAM,
max_backlogged_connections: int = 0,
):
"""Context manager which opens a socket with the given properties
and binds it to the given port.
Yields the already setup and listening socket object for use.
By default, it will only allow one single active connection
and instantly refuse any new ones. Use the `max_backlogged_connections`
argument if you'd like it to accept more connections as `pending`.
"""
sock = socket.socket(family=address_family, type=socket_kind)
if not host:
host = socket.gethostname()
sock.bind((host, port))
LOG.info(f"Successfully bound new socket on '{host}:{port}'")

try:
sock.listen(max_backlogged_connections)
LOG.info(f"Successfully started listening on '{host}:{port}'")
yield sock
finally:
sock.close()
LOG.info(f"Closed socket on '{host}:{port}'")


def check_file_paths_exist(
instance: harness.Instance, paths: List[str]
) -> Mapping[str, bool]:
Expand Down

0 comments on commit a278270

Please sign in to comment.