diff --git a/controlplane/api/v1beta2/ck8scontrolplane_types.go b/controlplane/api/v1beta2/ck8scontrolplane_types.go index b0209858..074de64b 100644 --- a/controlplane/api/v1beta2/ck8scontrolplane_types.go +++ b/controlplane/api/v1beta2/ck8scontrolplane_types.go @@ -34,9 +34,6 @@ const ( // This annotation is used to detect any changes in ClusterConfiguration and trigger machine rollout in KCP. CK8sServerConfigurationAnnotation = "controlplane.cluster.x-k8s.io/ck8s-server-configuration" - // SkipCoreDNSAnnotation annotation explicitly skips reconciling CoreDNS if set. - SkipCoreDNSAnnotation = "controlplane.cluster.x-k8s.io/skip-coredns" - // RemediationInProgressAnnotation is used to keep track that a KCP remediation is in progress, and more // specifically it tracks that the system is in between having deleted an unhealthy machine and recreating its replacement. // NOTE: if something external to CAPI removes this annotation the system cannot detect the above situation; this can lead to diff --git a/controlplane/api/v1beta2/condition_consts.go b/controlplane/api/v1beta2/condition_consts.go index 65bbe02d..e25e1164 100644 --- a/controlplane/api/v1beta2/condition_consts.go +++ b/controlplane/api/v1beta2/condition_consts.go @@ -97,30 +97,6 @@ const ( PodInspectionFailedReason = "PodInspectionFailed" ) -const ( - // EtcdClusterHealthyCondition documents the overall etcd cluster's health. - EtcdClusterHealthyCondition clusterv1.ConditionType = "EtcdClusterHealthyCondition" - - // EtcdClusterInspectionFailedReason documents a failure in inspecting the etcd cluster status. - EtcdClusterInspectionFailedReason = "EtcdClusterInspectionFailed" - - // EtcdClusterUnknownReason reports an etcd cluster in unknown status. - EtcdClusterUnknownReason = "EtcdClusterUnknown" - - // EtcdClusterUnhealthyReason (Severity=Error) is set when the etcd cluster is unhealthy. - EtcdClusterUnhealthyReason = "EtcdClusterUnhealthy" - - // MachineEtcdMemberHealthyCondition report the machine's etcd member's health status. - // NOTE: This conditions exists only if a stacked etcd cluster is used. - MachineEtcdMemberHealthyCondition clusterv1.ConditionType = "EtcdMemberHealthy" - - // EtcdMemberInspectionFailedReason documents a failure in inspecting the etcd member status. - EtcdMemberInspectionFailedReason = "MemberInspectionFailed" - - // EtcdMemberUnhealthyReason (Severity=Error) documents a Machine's etcd member is unhealthy. - EtcdMemberUnhealthyReason = "EtcdMemberUnhealthy" -) - const ( // TokenAvailableCondition documents whether the token required for nodes to join the cluster is available. TokenAvailableCondition clusterv1.ConditionType = "TokenAvailable" diff --git a/controlplane/controllers/ck8scontrolplane_controller.go b/controlplane/controllers/ck8scontrolplane_controller.go index c029ec3f..3da984b8 100644 --- a/controlplane/controllers/ck8scontrolplane_controller.go +++ b/controlplane/controllers/ck8scontrolplane_controller.go @@ -18,12 +18,12 @@ package controllers import ( "context" + "errors" "fmt" "strings" "time" "github.com/go-logr/logr" - "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -196,7 +196,7 @@ func (r *CK8sControlPlaneReconciler) reconcileDelete(ctx context.Context, cluste return reconcile.Result{}, err } - // Updates conditions reporting the status of static pods and the status of the etcd cluster. + // Updates conditions reporting the status of static pods // NOTE: Ignoring failures given that we are deleting if err := r.reconcileControlPlaneConditions(ctx, controlPlane); err != nil { logger.Info("failed to reconcile conditions", "error", err.Error()) @@ -498,25 +498,19 @@ func (r *CK8sControlPlaneReconciler) reconcile(ctx context.Context, cluster *clu } if err := r.syncMachines(ctx, kcp, controlPlane); err != nil { - return ctrl.Result{}, errors.Wrap(err, "failed to sync Machines") + return ctrl.Result{}, fmt.Errorf("failed to sync Machines: %w", err) } // Aggregate the operational state of all the machines; while aggregating we are adding the // source ref (reason@machine/name) so the problem can be easily tracked down to its source machine. conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, ownedMachines.ConditionGetters(), conditions.AddSourceRef(), conditions.WithStepCounterIf(false)) - // Updates conditions reporting the status of static pods and the status of the etcd cluster. + // Updates conditions reporting the status of static pods // NOTE: Conditions reporting KCP operation progress like e.g. Resized or SpecUpToDate are inlined with the rest of the execution. if err := r.reconcileControlPlaneConditions(ctx, controlPlane); err != nil { return reconcile.Result{}, err } - // Ensures the number of etcd members is in sync with the number of machines/nodes. - // NOTE: This is usually required after a machine deletion. - if err := r.reconcileEtcdMembers(ctx, controlPlane); err != nil { - return reconcile.Result{}, err - } - // Reconcile unhealthy machines by triggering deletion and requeue if it is considered safe to remediate, // otherwise continue with the other KCP operations. if result, err := r.reconcileUnhealthyMachines(ctx, controlPlane); err != nil || !result.IsZero() { @@ -562,26 +556,6 @@ func (r *CK8sControlPlaneReconciler) reconcile(ctx context.Context, cluster *clu return r.scaleDownControlPlane(ctx, cluster, kcp, controlPlane, collections.Machines{}) } - // Get the workload cluster client. - /** - workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster)) - if err != nil { - logger.V(2).Info("cannot get remote client to workload cluster, will requeue", "cause", err) - return ctrl.Result{Requeue: true}, nil - } - - // Update kube-proxy daemonset. - if err := workloadCluster.UpdateKubeProxyImageInfo(ctx, kcp); err != nil { - logger.Error(err, "failed to update kube-proxy daemonset") - return reconcile.Result{}, err - } - - // Update CoreDNS deployment. - if err := workloadCluster.UpdateCoreDNS(ctx, kcp); err != nil { - return reconcile.Result{}, fmt.Errorf("failed to update CoreDNS deployment") - } - **/ - return reconcile.Result{}, nil } @@ -661,8 +635,7 @@ func (r *CK8sControlPlaneReconciler) reconcileKubeconfig(ctx context.Context, cl return reconcile.Result{}, nil } -// reconcileControlPlaneConditions is responsible of reconciling conditions reporting the status of static pods and -// the status of the etcd cluster. +// reconcileControlPlaneConditions is responsible of reconciling conditions reporting the status of static pods. func (r *CK8sControlPlaneReconciler) reconcileControlPlaneConditions(ctx context.Context, controlPlane *ck8s.ControlPlane) error { // If the cluster is not yet initialized, there is no way to connect to the workload cluster and fetch information // for updating conditions. Return early. @@ -678,7 +651,6 @@ func (r *CK8sControlPlaneReconciler) reconcileControlPlaneConditions(ctx context // Update conditions status workloadCluster.UpdateAgentConditions(ctx, controlPlane) - workloadCluster.UpdateEtcdConditions(ctx, controlPlane) // Patch machines with the updated conditions. if err := controlPlane.PatchMachines(ctx); err != nil { @@ -722,66 +694,6 @@ func (r *CK8sControlPlaneReconciler) syncMachines(ctx context.Context, kcp *cont return nil } -// reconcileEtcdMembers ensures the number of etcd members is in sync with the number of machines/nodes. -// This is usually required after a machine deletion. -// -// NOTE: this func uses KCP conditions, it is required to call reconcileControlPlaneConditions before this. -func (r *CK8sControlPlaneReconciler) reconcileEtcdMembers(ctx context.Context, controlPlane *ck8s.ControlPlane) error { - // NOTE(neoaggelos): Upstream uses this to reach the etcd cluster and remove any members that have not yet - // been removed, typically after a machine has been deleted. In the case of k8s-dqlite, this is handled automatically - // for us, so we do not need to do anything here. - // - // We still leave this code around in case we need to do work in the future (e.g. make sure any removed nodes do not - // still appear on microcluster or k8s-dqlite). - - /** - log := ctrl.LoggerFrom(ctx) - // If k8s-dqlite is not managed by KCP this is a no-op. - if !controlPlane.IsEtcdManaged() { - return nil - } - - // If there is no KCP-owned control-plane machines, then control-plane has not been initialized yet. - if controlPlane.Machines.Len() == 0 { - return nil - } - - // Collect all the node names. - nodeNames := []string{} - for _, machine := range controlPlane.Machines { - if machine.Status.NodeRef == nil { - // If there are provisioning machines (machines without a node yet), return. - return nil - } - nodeNames = append(nodeNames, machine.Status.NodeRef.Name) - } - - // Potential inconsistencies between the list of members and the list of machines/nodes are - // surfaced using the EtcdClusterHealthyCondition; if this condition is true, meaning no inconsistencies exists, return early. - if conditions.IsTrue(controlPlane.KCP, controlplanev1.EtcdClusterHealthyCondition) { - return nil - } - - workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(controlPlane.Cluster)) - if err != nil { - // Failing at connecting to the workload cluster can mean workload cluster is unhealthy for a variety of reasons such as etcd quorum loss. - return errors.Wrap(err, "cannot get remote client to workload cluster") - } - - removedMembers, err := workloadCluster.ReconcileEtcdMembers(ctx, nodeNames) - if err != nil { - return errors.Wrap(err, "failed attempt to reconcile etcd members") - } - - if len(removedMembers) > 0 { - log.Info("Etcd members without nodes removed from the cluster", "members", removedMembers) - } - - **/ - - return nil -} - func (r *CK8sControlPlaneReconciler) upgradeControlPlane( ctx context.Context, cluster *clusterv1.Cluster, diff --git a/controlplane/controllers/machine_controller.go b/controlplane/controllers/machine_controller.go index 20659c9d..4e2fbf21 100644 --- a/controlplane/controllers/machine_controller.go +++ b/controlplane/controllers/machine_controller.go @@ -2,10 +2,10 @@ package controllers import ( "context" + "fmt" "time" "github.com/go-logr/logr" - "github.com/pkg/errors" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" @@ -34,15 +34,10 @@ func (r *MachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manag For(&clusterv1.Machine{}). Build(r) - // NOTE(neoaggelos): See note below if r.managementCluster == nil { r.managementCluster = &ck8s.Management{ Client: r.Client, K8sdDialTimeout: r.K8sdDialTimeout, - /* - EtcdDialTimeout: r.EtcdDialTimeout, - EtcdCallTimeout: r.EtcdCallTimeout, - */ } } @@ -51,6 +46,7 @@ func (r *MachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manag // +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=clusters;clusters/status,verbs=get;list;watch // +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch;create;update;patch;delete + func (r *MachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { logger := r.Log.WithValues("namespace", req.Namespace, "machine", req.Name) @@ -95,42 +91,16 @@ func (r *MachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct // Note that this currently makes the annotation a no-op in the code here. However, we still keep the logic in the code is case it // is needed in the future. - /** - cluster, err := util.GetClusterFromMetadata(ctx, r.Client, m.ObjectMeta) - if err != nil { - logger.Info("unable to get cluster.") - return ctrl.Result{}, errors.Wrapf(err, "unable to get cluster") - } - workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster)) - if err != nil { - logger.Error(err, "failed to create client to workload cluster") - return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster") - } - - etcdRemoved, err := workloadCluster.RemoveEtcdMemberForMachine(ctx, m) - if err != nil { - logger.Error(err, "failed to remove etcd member for machine") - return ctrl.Result{}, err - } - if !etcdRemoved { - logger.Info("wait embedded etcd controller to remove etcd") - return ctrl.Result{Requeue: true}, err - } - - // It is possible that the machine has no machine ref yet, will record the machine name in log - logger.Info("etcd remove etcd member succeeded", "machine name", m.Name) - **/ - patchHelper, err := patch.NewHelper(m, r.Client) if err != nil { - return ctrl.Result{}, errors.Wrapf(err, "failed to create patch helper for machine") + return ctrl.Result{}, fmt.Errorf("failed to create patch helper for machine: %w", err) } mAnnotations := m.GetAnnotations() delete(mAnnotations, clusterv1.PreTerminateDeleteHookAnnotationPrefix) m.SetAnnotations(mAnnotations) if err := patchHelper.Patch(ctx, m); err != nil { - return ctrl.Result{}, errors.Wrapf(err, "failed patch machine") + return ctrl.Result{}, fmt.Errorf("failed to patch machine: %w", err) } } diff --git a/controlplane/controllers/remediation.go b/controlplane/controllers/remediation.go index 2baa7c9b..8139a16d 100644 --- a/controlplane/controllers/remediation.go +++ b/controlplane/controllers/remediation.go @@ -23,7 +23,6 @@ import ( "time" "github.com/go-logr/logr" - "github.com/pkg/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/klog/v2" @@ -56,7 +55,7 @@ func (r *CK8sControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Cont m.DeletionTimestamp.IsZero() { patchHelper, err := patch.NewHelper(m, r.Client) if err != nil { - errList = append(errList, errors.Wrapf(err, "failed to get PatchHelper for machine %s", m.Name)) + errList = append(errList, fmt.Errorf("failed to get PatchHelper for machine %s: %w", m.Name, err)) continue } @@ -65,7 +64,7 @@ func (r *CK8sControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Cont if err := patchHelper.Patch(ctx, m, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{ clusterv1.MachineOwnerRemediatedCondition, }}); err != nil { - errList = append(errList, errors.Wrapf(err, "failed to patch machine %s", m.Name)) + errList = append(errList, fmt.Errorf("failed to patch machine %s: %w", m.Name, err)) } } } @@ -117,7 +116,7 @@ func (r *CK8sControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Cont }}); err != nil { log.Error(err, "Failed to patch control plane Machine", "Machine", machineToBeRemediated.Name) if retErr == nil { - retErr = errors.Wrapf(err, "failed to patch control plane Machine %s", machineToBeRemediated.Name) + retErr = fmt.Errorf("failed to patch control plane Machine %s: %w", machineToBeRemediated.Name, err) } } }() @@ -167,74 +166,7 @@ func (r *CK8sControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Cont // so that the cluster does not lock and cause the cluster to go down. In the case of k8s-dqlite, this is automatically handled by the // go-dqlite layer, and Canonical Kubernetes has logic to automatically keep a quorum of nodes in normal operation. // - // Therefore, we currently disable this check for simplicity, but should remember that we need this precondition before proceeing. - - /** - // Remediation MUST preserve etcd quorum. This rule ensures that KCP will not remove a member that would result in etcd - // losing a majority of members and thus become unable to field new requests. - if controlPlane.IsEtcdManaged() { - canSafelyRemediate, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, machineToBeRemediated) - if err != nil { - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) - return ctrl.Result{}, err - } - if !canSafelyRemediate { - log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation") - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") - return ctrl.Result{}, nil - } - } - **/ - - // Start remediating the unhealthy control plane machine by deleting it. - // A new machine will come up completing the operation as part of the regular reconcile. - - // NOTE(neoaggelos): Here, upstream will check whether the node that is about to be removed is the leader of the etcd cluster, and will - // attempt to forward the leadership to a different active node before proceeding. This is so that continuous operation of the cluster - // is preserved. - // - // TODO(neoaggelos): For Canonical Kubernetes, we should instead use the RemoveNode endpoint of the k8sd service from a different control - // plane node (through the k8sd-proxy), which will handle this operation for us. If that fails, we must not proceed. - - /** - // If the control plane is initialized, before deleting the machine: - // - if the machine hosts the etcd leader, forward etcd leadership to another machine. - // - delete the etcd member hosted on the machine being deleted. - workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(controlPlane.Cluster)) - if err != nil { - log.Error(err, "Failed to create client to workload cluster") - return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster") - } - - // If the machine that is about to be deleted is the etcd leader, move it to the newest member available. - if controlPlane.IsEtcdManaged() { - etcdLeaderCandidate := controlPlane.HealthyMachines().Newest() - if etcdLeaderCandidate == nil { - log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to") - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning, - "A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation") - return ctrl.Result{}, nil - } - if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil { - log.Error(err, "Failed to move etcd leadership to candidate machine", "candidate", klog.KObj(etcdLeaderCandidate)) - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) - return ctrl.Result{}, err - } - - patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client) - if err != nil { - return ctrl.Result{}, errors.Wrapf(err, "failed to create patch helper for machine") - } - - mAnnotations := machineToBeRemediated.GetAnnotations() - mAnnotations[clusterv1.PreTerminateDeleteHookAnnotationPrefix] = ck8sHookName - machineToBeRemediated.SetAnnotations(mAnnotations) - - if err := patchHelper.Patch(ctx, machineToBeRemediated); err != nil { - return ctrl.Result{}, errors.Wrapf(err, "failed patch machine for adding preTerminate hook") - } - } - **/ + // Therefore, we have removed this check for simplicity, but should remember that we need this precondition before proceeing. } microclusterPort := controlPlane.KCP.Spec.CK8sConfigSpec.ControlPlaneConfig.GetMicroclusterPort() @@ -242,7 +174,7 @@ func (r *CK8sControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Cont workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, clusterObjectKey, microclusterPort) if err != nil { log.Error(err, "failed to create client to workload cluster") - return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster") + return ctrl.Result{}, fmt.Errorf("failed to create client to workload cluster: %w", err) } if err := workloadCluster.RemoveMachineFromCluster(ctx, machineToBeRemediated); err != nil { @@ -252,7 +184,7 @@ func (r *CK8sControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Cont // Delete the machine if err := r.Client.Delete(ctx, machineToBeRemediated); err != nil { conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) - return ctrl.Result{}, errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name) + return ctrl.Result{}, fmt.Errorf("failed to delete unhealthy machine %s: %w", machineToBeRemediated.Name, err) } // Surface the operation is in progress. @@ -381,110 +313,6 @@ func maxDuration(x, y time.Duration) time.Duration { return x } -// NOTE(neoaggelos): See note above. Implementation kept here for future reference, only remove once the NOTEs and TODOs in the reconcileUnhealthyMachines -// have been fully addressed and are well-tested. - -//nolint:godot -/** -// canSafelyRemoveEtcdMember assess if it is possible to remove the member hosted on the machine to be remediated -// without loosing etcd quorum. -// -// The answer mostly depend on the existence of other failing members on top of the one being deleted, and according -// to the etcd fault tolerance specification (see https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance): -// - 3 CP cluster does not tolerate additional failing members on top of the one being deleted (the target -// cluster size after deletion is 2, fault tolerance 0) -// - 5 CP cluster tolerates 1 additional failing members on top of the one being deleted (the target -// cluster size after deletion is 4, fault tolerance 1) -// - 7 CP cluster tolerates 2 additional failing members on top of the one being deleted (the target -// cluster size after deletion is 6, fault tolerance 2) -// - etc. -// -// NOTE: this func assumes the list of members in sync with the list of machines/nodes, it is required to call reconcileEtcdMembers -// as well as reconcileControlPlaneConditions before this. -// -// adapted from kubeadm controller and makes the assumption that the set of controplane nodes equals the set of etcd nodes. -func (r *CK8sControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Context, controlPlane *ck8s.ControlPlane, machineToBeRemediated *clusterv1.Machine) (bool, error) { - log := ctrl.LoggerFrom(ctx) - - workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(controlPlane.Cluster)) - if err != nil { - return false, errors.Wrapf(err, "failed to get client for workload cluster %s", controlPlane.Cluster.Name) - } - - // Gets the etcd status - - // This makes it possible to have a set of etcd members status different from the MHC unhealthy/unhealthy conditions. - etcdMembers, err := workloadCluster.EtcdMembers(ctx) - if err != nil { - return false, errors.Wrapf(err, "failed to get etcdStatus for workload cluster %s", controlPlane.Cluster.Name) - } - - currentTotalMembers := len(etcdMembers) - - log.Info("etcd cluster before remediation", - "currentTotalMembers", currentTotalMembers) - - // Projects the target etcd cluster after remediation, considering all the etcd members except the one being remediated. - targetTotalMembers := 0 - targetUnhealthyMembers := 0 - - healthyMembers := []string{} - unhealthyMembers := []string{} - for _, etcdMember := range etcdMembers { - // Skip the machine to be deleted because it won't be part of the target etcd cluster. - if machineToBeRemediated.Status.NodeRef != nil && machineToBeRemediated.Status.NodeRef.Name == etcdMember { - continue - } - - // Include the member in the target etcd cluster. - targetTotalMembers++ - - // Search for the machine corresponding to the etcd member. - var machine *clusterv1.Machine - for _, m := range controlPlane.Machines { - if m.Status.NodeRef != nil && m.Status.NodeRef.Name == etcdMember { - machine = m - break - } - } - - // If an etcd member does not have a corresponding machine it is not possible to retrieve etcd member health, - // so KCP is assuming the worst scenario and considering the member unhealthy. - // - // NOTE: This should not happen given that KCP is running reconcileEtcdMembers before calling this method. - if machine == nil { - log.Info("An etcd member does not have a corresponding machine, assuming this member is unhealthy", "MemberName", etcdMember) - targetUnhealthyMembers++ - unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (no machine)", etcdMember)) - continue - } - - // Check member health as reported by machine's health conditions - if !conditions.IsTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition) { - targetUnhealthyMembers++ - unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name)) - continue - } - - healthyMembers = append(healthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name)) - } - - // See https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance for fault tolerance formula explanation. - targetQuorum := (targetTotalMembers / 2.0) + 1 - canSafelyRemediate := targetTotalMembers-targetUnhealthyMembers >= targetQuorum - - log.Info(fmt.Sprintf("etcd cluster projected after remediation of %s", machineToBeRemediated.Name), - "healthyMembers", healthyMembers, - "unhealthyMembers", unhealthyMembers, - "targetTotalMembers", targetTotalMembers, - "targetQuorum", targetQuorum, - "targetUnhealthyMembers", targetUnhealthyMembers, - "canSafelyRemediate", canSafelyRemediate) - - return canSafelyRemediate, nil -} -**/ - // RemediationData struct is used to keep track of information stored in the RemediationInProgressAnnotation in KCP // during remediation and then into the RemediationForAnnotation on the replacement machine once it is created. type RemediationData struct { @@ -503,7 +331,7 @@ type RemediationData struct { func RemediationDataFromAnnotation(value string) (*RemediationData, error) { ret := &RemediationData{} if err := json.Unmarshal([]byte(value), ret); err != nil { - return nil, errors.Wrapf(err, "failed to unmarshal value %s for %s annotation", value, clusterv1.RemediationInProgressReason) + return nil, fmt.Errorf("failed to unmarshal value %s for %s annotation: %w", value, clusterv1.RemediationInProgressReason, err) } return ret, nil } @@ -512,7 +340,7 @@ func RemediationDataFromAnnotation(value string) (*RemediationData, error) { func (r *RemediationData) Marshal() (string, error) { b, err := json.Marshal(r) if err != nil { - return "", errors.Wrapf(err, "failed to marshal value for %s annotation", clusterv1.RemediationInProgressReason) + return "", fmt.Errorf("failed to marshal value for %s annotation: %w", clusterv1.RemediationInProgressReason, err) } return string(b), nil } diff --git a/controlplane/controllers/scale.go b/controlplane/controllers/scale.go index de0f3777..1b31f50d 100644 --- a/controlplane/controllers/scale.go +++ b/controlplane/controllers/scale.go @@ -19,10 +19,10 @@ package controllers import ( "context" "encoding/json" + "errors" "fmt" "strings" - "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -119,52 +119,12 @@ func (r *CK8sControlPlaneReconciler) scaleDownControlPlane( return ctrl.Result{}, fmt.Errorf("failed to pick control plane Machine to delete: %w", err) } - // NOTE(neoaggelos): Here, upstream will check whether the node that is about to be removed is the leader of the etcd cluster, and will - // attempt to forward the leadership to a different active node before proceeding. This is so that continuous operation of the cluster - // is preserved. - // - // TODO(neoaggelos): For Canonical Kubernetes, we should instead use the RemoveNode endpoint of the k8sd service from a different control - // plane node (through the k8sd-proxy), which will handle this operation for us. If that fails, we must not proceed. - // - // Finally, note that upstream only acts if the cluster has a managed etcd. For Canonical Kubernetes, we must always perform this action, - // since we must delete the node from microcluster as well. - - /** - // If KCP should manage etcd, If etcd leadership is on machine that is about to be deleted, move it to the newest member available. - if controlPlane.IsEtcdManaged() { - workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster)) - if err != nil { - logger.Error(err, "Failed to create client to workload cluster") - return ctrl.Result{}, fmt.Errorf("failed to create client to workload cluster: %w", err) - } - - etcdLeaderCandidate := controlPlane.Machines.Newest() - if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToDelete, etcdLeaderCandidate); err != nil { - logger.Error(err, "Failed to move leadership to candidate machine", "candidate", etcdLeaderCandidate.Name) - return ctrl.Result{}, err - } - - patchHelper, err := patch.NewHelper(machineToDelete, r.Client) - if err != nil { - return ctrl.Result{}, errors.Wrapf(err, "failed to create patch helper for machine") - } - - mAnnotations := machineToDelete.GetAnnotations() - mAnnotations[clusterv1.PreTerminateDeleteHookAnnotationPrefix] = ck8sHookName - machineToDelete.SetAnnotations(mAnnotations) - - if err := patchHelper.Patch(ctx, machineToDelete); err != nil { - return ctrl.Result{}, errors.Wrapf(err, "failed patch machine for adding preTerminate hook") - } - } - **/ - microclusterPort := controlPlane.KCP.Spec.CK8sConfigSpec.ControlPlaneConfig.GetMicroclusterPort() clusterObjectKey := util.ObjectKey(cluster) workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, clusterObjectKey, microclusterPort) if err != nil { logger.Error(err, "failed to create client to workload cluster") - return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster") + return ctrl.Result{}, fmt.Errorf("failed to create client to workload cluster: %w", err) } if err := workloadCluster.RemoveMachineFromCluster(ctx, machineToDelete); err != nil { @@ -208,11 +168,6 @@ func (r *CK8sControlPlaneReconciler) preflightChecks(_ context.Context, controlP // Check machine health conditions; if there are conditions with False or Unknown, then wait. allMachineHealthConditions := []clusterv1.ConditionType{controlplanev1.MachineAgentHealthyCondition} - if controlPlane.IsEtcdManaged() { - allMachineHealthConditions = append(allMachineHealthConditions, - controlplanev1.MachineEtcdMemberHealthyCondition, - ) - } machineErrors := []error{} diff --git a/go.mod b/go.mod index 6d719655..52c72399 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,6 @@ require ( github.com/onsi/ginkgo v1.16.5 github.com/onsi/ginkgo/v2 v2.17.1 github.com/onsi/gomega v1.32.0 - github.com/pkg/errors v0.9.1 google.golang.org/protobuf v1.33.0 gopkg.in/yaml.v2 v2.4.0 k8s.io/api v0.29.3 @@ -92,6 +91,7 @@ require ( github.com/opencontainers/image-spec v1.0.2 // indirect github.com/pelletier/go-toml v1.9.5 // indirect github.com/pelletier/go-toml/v2 v2.1.0 // indirect + github.com/pkg/errors v0.9.1 // indirect github.com/prometheus/client_golang v1.18.0 // indirect github.com/prometheus/client_model v0.5.0 // indirect github.com/prometheus/common v0.45.0 // indirect diff --git a/pkg/ck8s/control_plane.go b/pkg/ck8s/control_plane.go index 0861d959..a9080001 100644 --- a/pkg/ck8s/control_plane.go +++ b/pkg/ck8s/control_plane.go @@ -340,7 +340,6 @@ func (c *ControlPlane) PatchMachines(ctx context.Context) error { if helper, ok := c.machinesPatchHelpers[machine.Name]; ok { if err := helper.Patch(ctx, machine, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{ controlplanev1.MachineAgentHealthyCondition, - controlplanev1.MachineEtcdMemberHealthyCondition, }}); err != nil { errList = append(errList, fmt.Errorf("failed to patch machine %s: %w", machine.Name, err)) } diff --git a/pkg/ck8s/management_cluster.go b/pkg/ck8s/management_cluster.go index 2af847f1..51dcb6f9 100644 --- a/pkg/ck8s/management_cluster.go +++ b/pkg/ck8s/management_cluster.go @@ -104,69 +104,9 @@ func (m *Management) GetWorkloadCluster(ctx context.Context, clusterKey client.O ClientRestConfig: restConfig, K8sdClientGenerator: g, microclusterPort: microclusterPort, - - /** - CoreDNSMigrator: &CoreDNSMigrator{}, - **/ - } - // NOTE(neoaggelos): Upstream creates an etcd client generator, so that users can reach etcd on each node. - // - // TODO(neoaggelos): For Canonical Kubernetes, we need to create a client generator for the k8sd endpoints on the control plane nodes. - - /** - // Retrieves the etcd CA key Pair - crtData, keyData, err := m.getEtcdCAKeyPair(ctx, clusterKey) - if err != nil { - return nil, err } - // If etcd CA is not nil, then it's managed etcd - if crtData != nil { - clientCert, err := generateClientCert(crtData, keyData) - if err != nil { - return nil, err - } - - caPool := x509.NewCertPool() - caPool.AppendCertsFromPEM(crtData) - tlsConfig := &tls.Config{ - RootCAs: caPool, - Certificates: []tls.Certificate{clientCert}, - MinVersion: tls.VersionTLS12, - } - tlsConfig.InsecureSkipVerify = true - workload.etcdClientGenerator = NewEtcdClientGenerator(restConfig, tlsConfig, m.EtcdDialTimeout, m.EtcdCallTimeout) - } - **/ - return workload, nil } -//nolint:godot -/** -func (m *Management) getEtcdCAKeyPair(ctx context.Context, clusterKey client.ObjectKey) ([]byte, []byte, error) { - etcdCASecret := &corev1.Secret{} - etcdCAObjectKey := client.ObjectKey{ - Namespace: clusterKey.Namespace, - Name: fmt.Sprintf("%s-etcd", clusterKey.Name), - } - - // Try to get the certificate via the uncached client. - if err := m.Client.Get(ctx, etcdCAObjectKey, etcdCASecret); err != nil { - if apierrors.IsNotFound(err) { - return nil, nil, nil - } else { - return nil, nil, errors.Wrapf(err, "failed to get secret; etcd CA bundle %s/%s", etcdCAObjectKey.Namespace, etcdCAObjectKey.Name) - } - } - - crtData, ok := etcdCASecret.Data[secret.TLSCrtDataName] - if !ok { - return nil, nil, errors.Errorf("etcd tls crt does not exist for cluster %s/%s", clusterKey.Namespace, clusterKey.Name) - } - keyData := etcdCASecret.Data[secret.TLSKeyDataName] - return crtData, keyData, nil -} -**/ - var _ ManagementCluster = &Management{} diff --git a/pkg/ck8s/workload_cluster.go b/pkg/ck8s/workload_cluster.go index 769144f8..89e0a550 100644 --- a/pkg/ck8s/workload_cluster.go +++ b/pkg/ck8s/workload_cluster.go @@ -9,7 +9,6 @@ import ( "strings" apiv1 "github.com/canonical/k8s-snap-api/api/v1" - "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -31,10 +30,6 @@ const ( k8sdConfigSecretName = "k8sd-config" //nolint:gosec ) -var ( - ErrControlPlaneMinNodes = errors.New("cluster has fewer than 2 control plane nodes; removing an etcd member is not supported") -) - // WorkloadCluster defines all behaviors necessary to upgrade kubernetes on a workload cluster // // TODO: Add a detailed description to each of these method definitions. @@ -42,28 +37,10 @@ type WorkloadCluster interface { // Basic health and status checks. ClusterStatus(ctx context.Context) (ClusterStatus, error) UpdateAgentConditions(ctx context.Context, controlPlane *ControlPlane) - UpdateEtcdConditions(ctx context.Context, controlPlane *ControlPlane) NewControlPlaneJoinToken(ctx context.Context, name string) (string, error) NewWorkerJoinToken(ctx context.Context) (string, error) RemoveMachineFromCluster(ctx context.Context, machine *clusterv1.Machine) error - - // NOTE(neoaggelos): See notes in (*CK8sControlPlaneReconciler).reconcileEtcdMembers - // - // TODO(neoaggelos): Replace with operations that use the k8sd proxy with things we need. For example, the function to remove a node _could_ be: - // - // RemoveMachineFromCluster(ctx context.Context, machine *clusterv1.Machine) - // - // Then, the implementation of WorkloadCluster should handle everything (reaching to k8sd, calling the right endpoints, authenticating, etc) - // internally. - /** - // Etcd tasks - RemoveEtcdMemberForMachine(ctx context.Context, machine *clusterv1.Machine) (bool, error) - ForwardEtcdLeadership(ctx context.Context, machine *clusterv1.Machine, leaderCandidate *clusterv1.Machine) error - ReconcileEtcdMembers(ctx context.Context, nodeNames []string) ([]string, error) - **/ - - // AllowBootstrapTokensToGetNodes(ctx context.Context) error } // Workload defines operations on workload clusters. @@ -75,13 +52,6 @@ type Workload struct { ClientRestConfig *rest.Config K8sdClientGenerator *k8sdClientGenerator microclusterPort int - - // NOTE(neoaggelos): CoreDNSMigrator and etcdClientGenerator are used by upstream to reach and manage the services in the workload cluster - // TODO(neoaggelos): Replace them with a k8sdProxyClientGenerator. - /** - CoreDNSMigrator coreDNSMigrator - etcdClientGenerator etcdClientFor - **/ } // ClusterStatus holds stats information about the cluster. @@ -99,9 +69,6 @@ func (w *Workload) getControlPlaneNodes(ctx context.Context) (*corev1.NodeList, labels := map[string]string{ // NOTE(neoaggelos): Canonical Kubernetes uses node-role.kubernetes.io/control-plane="" as a label for control plane nodes. labelNodeRoleControlPlane: "", - /** - labelNodeRoleControlPlane: "true", - **/ } if err := w.Client.List(ctx, nodes, ctrlclient.MatchingLabels(labels)); err != nil { return nil, err @@ -636,265 +603,4 @@ func aggregateFromMachinesToKCP(input aggregateFromMachinesToKCPInput) { // So there will be no condition at KCP level too. } -// UpdateEtcdConditions is responsible for updating machine conditions reflecting the status of all the etcd members. -// This operation is best effort, in the sense that in case of problems in retrieving member status, it sets -// the condition to Unknown state without returning any error. -func (w *Workload) UpdateEtcdConditions(ctx context.Context, controlPlane *ControlPlane) { - w.updateManagedEtcdConditions(ctx, controlPlane) -} - -func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) { - // NOTE: This methods uses control plane nodes only to get in contact with etcd but then it relies on etcd - // as ultimate source of truth for the list of members and for their health. - controlPlaneNodes, err := w.getControlPlaneNodes(ctx) - if err != nil { - conditions.MarkUnknown(controlPlane.KCP, controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterInspectionFailedReason, "Failed to list nodes which are hosting the etcd members") - for _, m := range controlPlane.Machines { - conditions.MarkUnknown(m, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberInspectionFailedReason, "Failed to get the node which is hosting the etcd member") - } - return - } - - // NOTE(neoaggelos): Upstream queries the etcd cluster endpoint on each of the machine nodes. It verifies that the list of etcd peers retrieved by - // every node in the cluster matches with other nodes, and also verifies that they report the same etcd cluster ID. - // - // In the case of k8s-dqlite, we should do similar steps against the k8s-dqlite cluster. Until that is implemented, we skip this check and assume - // that the node's datastore is in healthy condition if there are matching clusterv1.Machine and corev1.Node objects. - // - // TODO(neoaggelos): Implement API endpoints in k8sd to reach the local k8s-dqlite node and report the known cluster members. Then, verify that the - // list of members matches across all the nodes. - - // Update conditions for etcd members on the nodes. - var ( - // kcpErrors is used to store errors that can't be reported on any machine. - kcpErrors []string - /** - // clusterID is used to store and compare the etcd's cluster id. - clusterID *uint64 - // members is used to store the list of etcd members and compare with all the other nodes in the cluster. - members []*etcd.Member - **/ - ) - - for _, node := range controlPlaneNodes.Items { - // Search for the machine corresponding to the node. - var machine *clusterv1.Machine - for _, m := range controlPlane.Machines { - if m.Status.NodeRef != nil && m.Status.NodeRef.Name == node.Name { - machine = m - } - } - - if machine == nil { - // If there are machines still provisioning there is the chance that a chance that a node might be linked to a machine soon, - // otherwise report the error at KCP level given that there is no machine to report on. - if hasProvisioningMachine(controlPlane.Machines) { - continue - } - kcpErrors = append(kcpErrors, fmt.Sprintf("Control plane node %s does not have a corresponding machine", node.Name)) - continue - } - - // If the machine is deleting, report all the conditions as deleting - if !machine.ObjectMeta.DeletionTimestamp.IsZero() { - conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "") - continue - } - - /** - currentMembers, err := w.getCurrentEtcdMembers(ctx, machine, node.Name) - if err != nil { - continue - } - - // Check if the list of members IDs reported is the same as all other members. - // NOTE: the first member reporting this information is the baseline for this information. - if members == nil { - members = currentMembers - } - if !etcdutil.MemberEqual(members, currentMembers) { - conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "etcd member reports the cluster is composed by members %s, but all previously seen etcd members are reporting %s", etcdutil.MemberNames(currentMembers), etcdutil.MemberNames(members)) - continue - } - - // Retrieve the member and check for alarms. - // NB. The member for this node always exists given forFirstAvailableNode(node) used above - member := etcdutil.MemberForName(currentMembers, node.Name) - if member == nil { - conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "etcd member reports the cluster is composed by members %s, but the member itself (%s) is not included", etcdutil.MemberNames(currentMembers), node.Name) - continue - } - if len(member.Alarms) > 0 { - alarmList := []string{} - for _, alarm := range member.Alarms { - switch alarm { - case etcd.AlarmOK: - continue - default: - alarmList = append(alarmList, etcd.AlarmTypeName[alarm]) - } - } - if len(alarmList) > 0 { - conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Etcd member reports alarms: %s", strings.Join(alarmList, ", ")) - continue - } - } - - // Check if the member belongs to the same cluster as all other members. - // NOTE: the first member reporting this information is the baseline for this information. - if clusterID == nil { - clusterID = &member.ClusterID - } - if *clusterID != member.ClusterID { - conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "etcd member has cluster ID %d, but all previously seen etcd members have cluster ID %d", member.ClusterID, *clusterID) - continue - } - **/ - - conditions.MarkTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition) - } - - /** - // Make sure that the list of etcd members and machines is consistent. - kcpErrors = compareMachinesAndMembers(controlPlane, members, kcpErrors) - **/ - - // Aggregate components error from machines at KCP level - aggregateFromMachinesToKCP(aggregateFromMachinesToKCPInput{ - controlPlane: controlPlane, - machineConditions: []clusterv1.ConditionType{controlplanev1.MachineEtcdMemberHealthyCondition}, - kcpErrors: kcpErrors, - condition: controlplanev1.EtcdClusterHealthyCondition, - unhealthyReason: controlplanev1.EtcdClusterUnhealthyReason, - unknownReason: controlplanev1.EtcdClusterUnknownReason, - note: "etcd member", - }) -} - -//nolint:godot -/** -func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1.Machine, nodeName string) ([]*etcd.Member, error) { - // Create the etcd Client for the etcd Pod scheduled on the Node - etcdClient, err := w.etcdClientGenerator.forFirstAvailableNode(ctx, []string{nodeName}) - if err != nil { - conditions.MarkUnknown(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberInspectionFailedReason, "Failed to connect to the etcd pod on the %s node: %s", nodeName, err) - return nil, errors.Wrapf(err, "failed to get current etcd members: failed to connect to the etcd pod on the %s node", nodeName) - } - defer etcdClient.Close() - - // While creating a new client, forFirstAvailableNode retrieves the status for the endpoint; check if the endpoint has errors. - if len(etcdClient.Errors) > 0 { - conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Etcd member status reports errors: %s", strings.Join(etcdClient.Errors, ", ")) - return nil, errors.Errorf("failed to get current etcd members: etcd member status reports errors: %s", strings.Join(etcdClient.Errors, ", ")) - } - - // Gets the list etcd members known by this member. - currentMembers, err := etcdClient.Members(ctx) - if err != nil { - // NB. We should never be in here, given that we just received answer to the etcd calls included in forFirstAvailableNode; - // however, we are considering the calls to Members a signal of etcd not being stable. - conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Failed get answer from the etcd member on the %s node", nodeName) - return nil, errors.Errorf("failed to get current etcd members: failed get answer from the etcd member on the %s node", nodeName) - } - - return currentMembers, nil -} - -func compareMachinesAndMembers(controlPlane *ControlPlane, members []*etcd.Member, kcpErrors []string) []string { - // NOTE: We run this check only if we actually know the list of members, otherwise the first for loop - // could generate a false negative when reporting missing etcd members. - if members == nil { - return kcpErrors - } - - // Check Machine -> Etcd member. - for _, machine := range controlPlane.Machines { - if machine.Status.NodeRef == nil { - continue - } - found := false - for _, member := range members { - nodeNameFromMember := etcdutil.NodeNameFromMember(member) - if machine.Status.NodeRef.Name == nodeNameFromMember { - found = true - break - } - } - if !found { - conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Missing etcd member") - } - } - - // Check Etcd member -> Machine. - for _, member := range members { - found := false - nodeNameFromMember := etcdutil.NodeNameFromMember(member) - for _, machine := range controlPlane.Machines { - if machine.Status.NodeRef != nil && machine.Status.NodeRef.Name == nodeNameFromMember { - found = true - break - } - } - if !found { - name := nodeNameFromMember - if name == "" { - name = fmt.Sprintf("%d (Name not yet assigned)", member.ID) - } - kcpErrors = append(kcpErrors, fmt.Sprintf("etcd member %s does not have a corresponding machine", name)) - } - } - return kcpErrors -} - -func generateClientCert(caCertEncoded, caKeyEncoded []byte) (tls.Certificate, error) { - // TODO: need to cache clientkey to clusterCacheTracker to avoid recreating key frequently - clientKey, err := certs.NewPrivateKey() - if err != nil { - return tls.Certificate{}, errors.Wrapf(err, "error creating client key") - } - - caCert, err := certs.DecodeCertPEM(caCertEncoded) - if err != nil { - return tls.Certificate{}, err - } - caKey, err := certs.DecodePrivateKeyPEM(caKeyEncoded) - if err != nil { - return tls.Certificate{}, err - } - x509Cert, err := newClientCert(caCert, clientKey, caKey) - if err != nil { - return tls.Certificate{}, err - } - return tls.X509KeyPair(certs.EncodeCertPEM(x509Cert), certs.EncodePrivateKeyPEM(clientKey)) -} - -func newClientCert(caCert *x509.Certificate, key *rsa.PrivateKey, caKey crypto.Signer) (*x509.Certificate, error) { - cfg := certs.Config{ - CommonName: "cluster-api.x-k8s.io", - } - - now := time.Now().UTC() - - tmpl := x509.Certificate{ - SerialNumber: new(big.Int).SetInt64(0), - Subject: pkix.Name{ - CommonName: cfg.CommonName, - Organization: cfg.Organization, - }, - NotBefore: now.Add(time.Minute * -5), - NotAfter: now.Add(time.Hour * 24 * 365 * 10), // 10 years - KeyUsage: x509.KeyUsageDigitalSignature, - ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth}, - } - - b, err := x509.CreateCertificate(rand.Reader, &tmpl, caCert, key.Public(), caKey) - if err != nil { - return nil, errors.Wrapf(err, "failed to create signed client certificate: %+v", tmpl) - } - - c, err := x509.ParseCertificate(b) - return c, errors.WithStack(err) -} -**/ - var _ WorkloadCluster = &Workload{} diff --git a/pkg/ck8s/workload_cluster_k8sd.go b/pkg/ck8s/workload_cluster_k8sd.go index 8b0b312d..d475a8da 100644 --- a/pkg/ck8s/workload_cluster_k8sd.go +++ b/pkg/ck8s/workload_cluster_k8sd.go @@ -4,11 +4,11 @@ import ( "context" "crypto/tls" _ "embed" + "errors" "fmt" "net/http" "time" - "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" @@ -71,7 +71,7 @@ func (g *k8sdClientGenerator) forNode(ctx context.Context, node *corev1.Node) (* func (g *k8sdClientGenerator) getProxyPods(ctx context.Context) (map[string]string, error) { pods, err := g.clientset.CoreV1().Pods("kube-system").List(ctx, metav1.ListOptions{LabelSelector: "app=k8sd-proxy"}) if err != nil { - return nil, errors.Wrap(err, "unable to list k8sd-proxy pods in target cluster") + return nil, fmt.Errorf("unable to list k8sd-proxy pods in target cluster: %w", err) } if len(pods.Items) == 0 { diff --git a/pkg/locking/control_plane_init_mutex.go b/pkg/locking/control_plane_init_mutex.go index e5a1e24f..7bc607ca 100644 --- a/pkg/locking/control_plane_init_mutex.go +++ b/pkg/locking/control_plane_init_mutex.go @@ -22,7 +22,6 @@ import ( "encoding/json" "fmt" - "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -156,7 +155,7 @@ func configMapName(clusterName string) string { func (s semaphore) information() (*information, error) { li := &information{} if err := json.Unmarshal([]byte(s.Data[semaphoreInformationKey]), li); err != nil { - return nil, errors.Wrap(err, "failed to unmarshal semaphore information") + return nil, fmt.Errorf("failed to unmarshal semaphore information: %w", err) } return li, nil } @@ -164,7 +163,7 @@ func (s semaphore) information() (*information, error) { func (s semaphore) setInformation(information *information) error { b, err := json.Marshal(information) if err != nil { - return errors.Wrap(err, "failed to marshal semaphore information") + return fmt.Errorf("failed to marshal semaphore information: %w", err) } s.Data = map[string]string{} s.Data[semaphoreInformationKey] = string(b) diff --git a/pkg/proxy/dial.go b/pkg/proxy/dial.go index d3d80729..3d78b877 100644 --- a/pkg/proxy/dial.go +++ b/pkg/proxy/dial.go @@ -18,12 +18,12 @@ package proxy import ( "context" + "errors" "fmt" "net" "net/http" "time" - "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/kubernetes" @@ -99,7 +99,7 @@ func (d *Dialer) DialContext(_ context.Context, _ string, addr string) (net.Conn // Warning: Any early return should close this connection, otherwise we're going to leak them. connection, _, err := dialer.Dial(portforward.PortForwardProtocolV1Name) if err != nil { - return nil, errors.Wrap(err, "error upgrading connection") + return nil, fmt.Errorf("error upgrading connection: %w", err) } // Create the headers. @@ -136,7 +136,7 @@ func (d *Dialer) DialContext(_ context.Context, _ string, addr string) (net.Conn dataStream, err := connection.CreateStream(headers) if err != nil { return nil, kerrors.NewAggregate([]error{ - errors.Wrap(err, "error creating forwarding stream"), + fmt.Errorf("error creating forwarding stream: %w", err), connection.Close(), }) } diff --git a/test/e2e/helpers.go b/test/e2e/helpers.go index f3724784..42ad619a 100644 --- a/test/e2e/helpers.go +++ b/test/e2e/helpers.go @@ -26,7 +26,6 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - "github.com/pkg/errors" "golang.org/x/mod/semver" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -419,7 +418,7 @@ func WaitForControlPlaneToBeReady(ctx context.Context, input WaitForControlPlane } Byf("Getting the control plane %s", klog.KObj(input.ControlPlane)) if err := input.Getter.Get(ctx, key, controlplane); err != nil { - return false, errors.Wrapf(err, "failed to get KCP") + return false, fmt.Errorf("failed to get KCP: %w", err) } desiredReplicas := controlplane.Spec.Replicas