nodecondition: SSA the NetworkUnavailable condition (don't merge-patch)

The previous implementation used JSON merge-patch (types.MergePatchType) with a one-element conditions array. JSON merge-patch on arrays is whole-array replacement, so every 60s flock-agent stomped over the kubelet-managed conditions (Ready, MemoryPressure, DiskPressure, PIDPressure), leaving only NetworkUnavailable on the node — until kubelet's next status post (~5s later) re-set them. Symptom: `kubectl get nodes` flickered, with one node briefly showing Unknown each polling tick. k9s lit up red on rotating nodes. (kube-controller-manager is also a write contender and was correctly noted in the field-managers list.) Switch to Server-Side Apply against the status subresource with fieldManager=flock-agent and Force=true. NodeStatus.Conditions is a listType=map keyed by `type`, so SSA merges by type — we declare ownership of only the NetworkUnavailable entry and leave kubelet's entries untouched. Force lets us reclaim the condition if a previous CNI manager (e.g. calico-node finalizer leftovers) still owns it. Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
2026-04-26 08:55:03 -05:00
parent a6a50fd73f
commit e00579f7ca
1 changed files with 38 additions and 23 deletions
@@ -2,25 +2,37 @@ package agent

 import (
 	"context"
-	"encoding/json"
 	"fmt"
 	"log/slog"
 	"time"

-	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
 )

+// nodeStatusFieldManager identifies flock-agent in apiserver field-manager bookkeeping.
+// Server-Side Apply only takes ownership of the fields we send, so other
+// managers (kubelet, kcm) keep their conditions untouched between our writes.
+const nodeStatusFieldManager = "flock-agent"
+
 // keepNetworkAvailable maintains a NetworkUnavailable=False condition on
 // the node's status. Calico-node sets this False while it owns CNI; on
 // shutdown it sets it to True with reason CalicoIsDown, which adds the
 // node.kubernetes.io/network-unavailable taint and blocks new scheduling.
-// Once flock-agent is in charge, we own the condition.
+// Once flock-agent is in charge, we own that single condition.
 //
-// Re-applies every minute — heartbeat-style — so a stale condition from a
+// Uses Server-Side Apply against the status subresource. NodeStatus.Conditions
+// is a listType=map keyed by `type`, so SSA merges by type — our partial body
+// declares ownership of just the NetworkUnavailable entry and leaves the
+// kubelet-managed conditions (Ready, MemoryPressure, DiskPressure, PIDPressure)
+// alone. A prior implementation used JSON merge-patch with a one-element
+// conditions array, which the apiserver REPLACES (merge-patch on arrays is
+// whole-array semantics) — that race-stripped the kubelet conditions every
+// 60s and produced ~5s flickers in `kubectl get nodes`.
+//
+// Re-applies every minute (heartbeat-style) so a stale condition from a
 // previous CNI is overwritten without an explicit transition.
 func keepNetworkAvailable(ctx context.Context, cfg *rest.Config, node string, logger *slog.Logger) {
 	cs, err := kubernetes.NewForConfig(cfg)
@@ -29,23 +41,29 @@ func keepNetworkAvailable(ctx context.Context, cfg *rest.Config, node string, lo
 		return
 	}
 	apply := func() {
-		now := metav1.Now()
-		patch := map[string]interface{}{
-			"status": map[string]interface{}{
-				"conditions": []corev1.NodeCondition{{
-					Type:               corev1.NodeNetworkUnavailable,
-					Status:             corev1.ConditionFalse,
-					Reason:             "FlockReady",
-					Message:            "flock-agent owns CNI on this node",
-					LastHeartbeatTime:  now,
-					LastTransitionTime: now,
-				}},
-			},
-		}
-		body, _ := json.Marshal(patch)
-		_, err := cs.CoreV1().Nodes().Patch(ctx, node, types.MergePatchType, body, metav1.PatchOptions{}, "status")
+		now := metav1.Now().UTC().Format(time.RFC3339)
+		// Hand-build the SSA body so we only declare the fields we own.
+		// Force=true lets us reclaim the condition if a previous CNI's
+		// finalizer/cleanup left it owned by a different manager.
+		body := []byte(fmt.Sprintf(`{
+  "apiVersion": "v1",
+  "kind": "Node",
+  "metadata": {"name": %q},
+  "status": {"conditions": [{
+    "type": "NetworkUnavailable",
+    "status": "False",
+    "reason": "FlockReady",
+    "message": "flock-agent owns CNI on this node",
+    "lastHeartbeatTime": %q,
+    "lastTransitionTime": %q
+  }]}
+}`, node, now, now))
+		force := true
+		_, err := cs.CoreV1().Nodes().Patch(ctx, node, types.ApplyPatchType, body,
+			metav1.PatchOptions{FieldManager: nodeStatusFieldManager, Force: &force},
+			"status")
 		if err != nil {
-			logger.Warn("network-condition: patch failed", "err", err)
+			logger.Warn("network-condition: ssa apply failed", "err", err)
 			return
 		}
 	}
@@ -61,6 +79,3 @@ func keepNetworkAvailable(ctx context.Context, cfg *rest.Config, node string, lo
 		}
 	}
 }
-
-// silence unused-import warnings on non-Linux builds where this is unused.
-var _ = fmt.Sprintf