nodecondition: SSA the NetworkUnavailable condition (don't merge-patch)

The previous implementation used JSON merge-patch (types.MergePatchType) with a one-element conditions array. JSON merge-patch on arrays is whole-array replacement, so every 60s flock-agent stomped over the kubelet-managed conditions (Ready, MemoryPressure, DiskPressure, PIDPressure), leaving only NetworkUnavailable on the node — until kubelet's next status post (~5s later) re-set them. Symptom: `kubectl get nodes` flickered, with one node briefly showing Unknown each polling tick. k9s lit up red on rotating nodes. (kube-controller-manager is also a write contender and was correctly noted in the field-managers list.) Switch to Server-Side Apply against the status subresource with fieldManager=flock-agent and Force=true. NodeStatus.Conditions is a listType=map keyed by `type`, so SSA merges by type — we declare ownership of only the NetworkUnavailable entry and leave kubelet's entries untouched. Force lets us reclaim the condition if a previous CNI manager (e.g. calico-node finalizer leftovers) still owns it. Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
2026-04-26 08:55:03 -05:00
parent a6a50fd73f
commit e00579f7ca
1 changed files with 38 additions and 23 deletions
@@ -2,25 +2,37 @@ package agent
 import (
 	"context"
 	"encoding/json"
 	"fmt"
 	"log/slog"
 	"time"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
 )
 // nodeStatusFieldManager is the field-manager name that identifies flock-agent
 // in apiserver field-manager bookkeeping; it is passed as
 // PatchOptions.FieldManager when applying node status.
 // Server-Side Apply only takes ownership of the fields we send, so other
 // managers (kubelet, kcm) keep their conditions untouched between our writes.
 const nodeStatusFieldManager = "flock-agent"
 // keepNetworkAvailable maintains a NetworkUnavailable=False condition on
 // the node's status. Calico-node sets this False while it owns CNI; on
 // shutdown it sets it to True with reason CalicoIsDown, which adds the
 // node.kubernetes.io/network-unavailable taint and blocks new scheduling.
-// Once flock-agent is in charge, we own the condition.
+// Once flock-agent is in charge, we own that single condition.
 //
-// Re-applies every minute — heartbeat-style — so a stale condition from a
+// Uses Server-Side Apply against the status subresource. NodeStatus.Conditions
 // is a listType=map keyed by `type`, so SSA merges by type — our partial body
 // declares ownership of just the NetworkUnavailable entry and leaves the
 // kubelet-managed conditions (Ready, MemoryPressure, DiskPressure, PIDPressure)
 // alone. A prior implementation used JSON merge-patch with a one-element
 // conditions array, which the apiserver REPLACES (merge-patch on arrays is
 // whole-array semantics) — that race-stripped the kubelet conditions every
 // 60s and produced ~5s flickers in `kubectl get nodes`.
 //
 // Re-applies every minute (heartbeat-style) so a stale condition from a
 // previous CNI is overwritten without an explicit transition.
 func keepNetworkAvailable(ctx context.Context, cfg *rest.Config, node string, logger *slog.Logger) {
 	cs, err := kubernetes.NewForConfig(cfg)
@@ -29,23 +41,29 @@ func keepNetworkAvailable(ctx context.Context, cfg *rest.Config, node string, lo
 		return
 	}
 	apply := func() {
-		now := metav1.Now()
+		now := metav1.Now().UTC().Format(time.RFC3339)
-		patch := map[string]interface{}{
+		// Hand-build the SSA body so we only declare the fields we own.
-			"status": map[string]interface{}{
+		// Force=true lets us reclaim the condition if a previous CNI's
-				"conditions": []corev1.NodeCondition{{
+		// finalizer/cleanup left it owned by a different manager.
-					Type:               corev1.NodeNetworkUnavailable,
+		body := []byte(fmt.Sprintf(`{
-					Status:             corev1.ConditionFalse,
+  "apiVersion": "v1",
-					Reason:             "FlockReady",
+  "kind": "Node",
-					Message:            "flock-agent owns CNI on this node",
+  "metadata": {"name": %q},
-					LastHeartbeatTime:  now,
+  "status": {"conditions": [{
-					LastTransitionTime: now,
+    "type": "NetworkUnavailable",
-				}},
+    "status": "False",
-			},
+    "reason": "FlockReady",
-		}
+    "message": "flock-agent owns CNI on this node",
-		body, _ := json.Marshal(patch)
+    "lastHeartbeatTime": %q,
-		_, err := cs.CoreV1().Nodes().Patch(ctx, node, types.MergePatchType, body, metav1.PatchOptions{}, "status")
+    "lastTransitionTime": %q
  }]}
 }`, node, now, now))
 		force := true
 		_, err := cs.CoreV1().Nodes().Patch(ctx, node, types.ApplyPatchType, body,
 			metav1.PatchOptions{FieldManager: nodeStatusFieldManager, Force: &force},
 			"status")
 		if err != nil {
-			logger.Warn("network-condition: patch failed", "err", err)
+			logger.Warn("network-condition: ssa apply failed", "err", err)
 			return
 		}
 	}
@@ -61,6 +79,3 @@ func keepNetworkAvailable(ctx context.Context, cfg *rest.Config, node string, lo
 		}
 	}
 }
 // Blank-identifier reference that keeps the fmt import in use; per the
 // original note, it silences unused-import warnings on non-Linux builds where
 // fmt is otherwise unused.
 // NOTE(review): fmt.Sprintf is also called inside keepNetworkAvailable in this
 // file, so this guard may be redundant — confirm before relying on or
 // removing it.
 var _ = fmt.Sprintf