Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix containerd-related path cleanup on failed bootstrap/join and associated test. #910

Merged
merged 3 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions src/k8s/pkg/k8sd/app/hooks_remove.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"database/sql"
"errors"
"fmt"
"io/fs"
"net"
"os"

Expand Down Expand Up @@ -177,17 +178,20 @@ func tryCleanupContainerdPaths(log log.Logger, s snap.Snap) {
}

// Check directory exists before attempting to remove:
if _, err := os.Stat(dirpath); os.IsNotExist(err) {
if stat, err := os.Stat(dirpath); os.IsNotExist(err) {
log.Info("Containerd directory doesn't exist; skipping cleanup", "directory", dirpath)
} else {
// NOTE(aznashwan): because of the convoluted interfaces-based way the snap
// composes and creates the original lockfiles (see k8sd/setup/containerd.go)
// this check is meant to defend against accidental code/configuration errors which
// might lead to the root FS being deleted:
realPath, err := os.Readlink(dirpath)
if err != nil {
log.Error(err, fmt.Sprintf("Failed to os.Readlink the directory path for lockfile %q pointing to %q. Skipping cleanup", lockpath, dirpath))
continue
realPath := dirpath
if stat.Mode()&fs.ModeSymlink != 0 {
berkayoz marked this conversation as resolved.
Show resolved Hide resolved
// NOTE(aznashwan): because of the convoluted interfaces-based way the snap
// composes and creates the original lockfiles (see k8sd/setup/containerd.go)
// this check is meant to defend against accidental code/configuration errors which
// might lead to the root FS being deleted:
realPath, err = os.Readlink(dirpath)
if err != nil {
log.Error(err, fmt.Sprintf("Failed to os.Readlink the directory path for lockfile %q pointing to %q. Skipping cleanup", lockpath, dirpath))
continue
}
}

if realPath == "/" {
Expand Down
15 changes: 15 additions & 0 deletions src/k8s/pkg/k8sd/setup/containerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,21 @@ func defaultContainerdConfig(
// Containerd configures configuration and arguments for containerd on the local node.
// Optionally, a number of registry mirrors and auths can be configured.
func Containerd(snap snap.Snap, extraContainerdConfig map[string]any, extraArgs map[string]*string) error {
// We create the directories here since PreInitCheck is called before this
// This ensures we only create the directories if we are going to configure containerd
for _, dir := range []string{
snap.ContainerdConfigDir(),
snap.ContainerdExtraConfigDir(),
snap.ContainerdRegistryConfigDir(),
} {
if dir == "" {
continue
}
if err := os.MkdirAll(dir, 0o700); err != nil {
return fmt.Errorf("failed to create required directory: %w", err)
}
}

configToml := defaultContainerdConfig(
snap.CNIConfDir(),
snap.CNIBinDir(),
Expand Down
3 changes: 0 additions & 3 deletions src/k8s/pkg/k8sd/setup/directories.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@ func EnsureAllDirectories(snap snap.Snap) error {

for _, dir := range []string{
snap.CNIConfDir(),
snap.ContainerdConfigDir(),
snap.ContainerdExtraConfigDir(),
snap.ContainerdRegistryConfigDir(),
snap.K8sDqliteStateDir(),
snap.KubernetesConfigDir(),
snap.KubernetesPKIDir(),
Expand Down
18 changes: 18 additions & 0 deletions tests/integration/templates/bootstrap-fail.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Contains the bootstrap configuration for the smoke test.
cluster-config:
network:
enabled: true
dns:
enabled: true
ingress:
enabled: true
load-balancer:
enabled: true
local-storage:
enabled: true
gateway:
enabled: true
metrics-server:
enabled: true
extra-node-kube-apiserver-args:
--foo: bar
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great idea, it hadn't crossed my mind to induce the failure with incorrect args!

52 changes: 20 additions & 32 deletions tests/integration/tests/test_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@

LOG = logging.getLogger(__name__)

KUBE_CONTROLLER_MANAGER_SNAP_PORT = 10257
CONTAINERD_SOCKET_DIRECTORY_CLASSIC = "/run/containerd"

CONTAINERD_PATHS = [
"/etc/containerd",
"/run/containerd",
CONTAINERD_SOCKET_DIRECTORY_CLASSIC,
"/var/lib/containerd",
]
CNI_PATH = "/opt/cni/bin"
Expand Down Expand Up @@ -112,48 +112,36 @@ def test_node_cleanup_new_containerd_path(instances: List[harness.Instance]):


@pytest.mark.node_count(1)
@pytest.mark.no_setup()
@pytest.mark.disable_k8s_bootstrapping()
@pytest.mark.tags(tags.NIGHTLY)
@pytest.mark.skip(reason="the test fails when using a harness other than 'local'")
def test_containerd_path_cleanup_on_failed_init(
instances: List[harness.Instance], tmp_path
):
def test_containerd_path_cleanup_on_failed_init(instances: List[harness.Instance]):
"""Tests that a failed `bootstrap` properly cleans up any
containerd-related paths it may have created as part of the
failed `bootstrap`.

It induces a bootstrap failure by pre-binding a required k8s service
port (10257 for the kube-controller-manager) before running `k8s bootstrap`.
It introduces a bootstrap failure by supplying an incorrect argument to the kube-apiserver.

The bootstrap/join-cluster aborting behavior was added in this PR:
https://github.com/canonical/k8s-snap/pull/772

NOTE: a failed `join-cluster` will trigger the exact same cleanup
hook, so the test implicitly applies to it as well.
"""
instance = instances[0]
expected_code = 1
expected_message = (
"Encountered error(s) while verifying port availability for Kubernetes "
"services: Port 10257 (needed by: kube-controller-manager) is already in use."
)

with util.open_port(KUBE_CONTROLLER_MANAGER_SNAP_PORT) as _:
util.setup_k8s_snap(instance, tmp_path, config.SNAP, connect_interfaces=False)
fail_bootstrap_config = (config.MANIFESTS_DIR / "bootstrap-fail.yaml").read_text()

proc = instance.exec(
["k8s", "bootstrap"], capture_output=True, text=True, check=False
)

if proc.returncode != expected_code:
raise AssertionError(
f"Expected `k8s bootstrap` to exit with code {expected_code}, "
f"but it exited with {proc.returncode}.\n"
f"Stdout was: \n{proc.stdout}.\nStderr was: \n{proc.stderr}"
)
proc = instance.exec(
["k8s", "bootstrap", "--file", "-"],
input=str.encode(fail_bootstrap_config),
check=False,
)

if expected_message not in proc.stderr:
raise AssertionError(
f"Expected to find port-related warning '{expected_message}' in "
"stderr of the `k8s bootstrap` command.\n"
f"Stdout was: \n{proc.stdout}.\nStderr was: \n{proc.stderr}"
)
if proc.returncode != expected_code:
raise AssertionError(
f"Expected `k8s bootstrap` to exit with code {expected_code}, "
f"but it exited with {proc.returncode}.\n"
)

_assert_paths_not_exist(instance, CONTAINERD_PATHS)
_assert_paths_not_exist(instance, CONTAINERD_PATHS)
34 changes: 0 additions & 34 deletions tests/integration/tests/test_util/util.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
#
# Copyright 2024 Canonical, Ltd.
#
import contextlib
import ipaddress
import json
import logging
import re
import shlex
import socket
import subprocess
import urllib.request
from datetime import datetime
Expand Down Expand Up @@ -518,38 +516,6 @@ def find_suitable_cidr(parent_cidr: str, excluded_ips: List[str]):
raise RuntimeError("Could not find a suitable CIDR for LoadBalancer services")


@contextlib.contextmanager
def open_port(
port: int,
host: str = "",
address_family: socket.AddressFamily = socket.AF_INET,
socket_kind: socket.SocketKind = socket.SOCK_STREAM,
max_backlogged_connections: int = 0,
):
"""Context manager which opens a socket with the given properties
and binds it to the given port.

Yields the already setup and listening socket object for use.

By default, it will only allow one single active connection
and instantly refuse any new ones. Use the `max_backlogged_connections`
argument if you'd like it to accept more connections as `pending`.
"""
sock = socket.socket(family=address_family, type=socket_kind)
if not host:
host = socket.gethostname()
sock.bind((host, port))
LOG.info(f"Successfully bound new socket on '{host}:{port}'")

try:
sock.listen(max_backlogged_connections)
LOG.info(f"Successfully started listening on '{host}:{port}'")
yield sock
finally:
sock.close()
LOG.info(f"Closed socket on '{host}:{port}'")


def check_file_paths_exist(
instance: harness.Instance, paths: List[str]
) -> Mapping[str, bool]:
Expand Down
Loading