e00579f7ca
Build flock Image / build (push) Has been cancelled
The previous implementation used JSON merge-patch (types.MergePatchType) with a one-element conditions array. JSON merge-patch on arrays is whole-array replacement, so every 60s flock-agent stomped over the kubelet-managed conditions (Ready, MemoryPressure, DiskPressure, PIDPressure), leaving only NetworkUnavailable on the node — until kubelet's next status post (~5s later) re-set them. Symptom: `kubectl get nodes` flickered, with one node briefly showing Unknown each polling tick. k9s lit up red on rotating nodes. (kube-controller-manager is also a write contender and was correctly noted in the field-managers list.) Switch to Server-Side Apply against the status subresource with fieldManager=flock-agent and Force=true. NodeStatus.Conditions is a listType=map keyed by `type`, so SSA merges by type — we declare ownership of only the NetworkUnavailable entry and leave kubelet's entries untouched. Force lets us reclaim the condition if a previous CNI manager (e.g. calico-node finalizer leftovers) still owns it. Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
82 lines
2.8 KiB
Go
82 lines
2.8 KiB
Go
package agent
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"time"
|
|
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
"k8s.io/apimachinery/pkg/types"
|
|
"k8s.io/client-go/kubernetes"
|
|
"k8s.io/client-go/rest"
|
|
)
|
|
|
|
// nodeStatusFieldManager identifies flock-agent in apiserver field-manager bookkeeping.
|
|
// Server-Side Apply only takes ownership of the fields we send, so other
|
|
// managers (kubelet, kube-controller-manager) keep their conditions untouched between our writes.
|
|
const nodeStatusFieldManager = "flock-agent"
|
|
|
|
// keepNetworkAvailable maintains a NetworkUnavailable=False condition on
|
|
// the node's status. Calico-node sets this False while it owns CNI; on
|
|
// shutdown it sets it to True with reason CalicoIsDown, which adds the
|
|
// node.kubernetes.io/network-unavailable taint and blocks new scheduling.
|
|
// Once flock-agent is in charge, we own that single condition.
|
|
//
|
|
// Uses Server-Side Apply against the status subresource. NodeStatus.Conditions
|
|
// is a listType=map keyed by `type`, so SSA merges by type — our partial body
|
|
// declares ownership of just the NetworkUnavailable entry and leaves the
|
|
// kubelet-managed conditions (Ready, MemoryPressure, DiskPressure, PIDPressure)
|
|
// alone. A prior implementation used JSON merge-patch with a one-element
|
|
// conditions array, which the apiserver REPLACES (merge-patch on arrays is
|
|
// whole-array semantics) — that race-stripped the kubelet conditions every
|
|
// 60s and produced ~5s flickers in `kubectl get nodes`.
|
|
//
|
|
// Re-applies every minute (heartbeat-style) so a stale condition from a
|
|
// previous CNI is overwritten without an explicit transition.
|
|
func keepNetworkAvailable(ctx context.Context, cfg *rest.Config, node string, logger *slog.Logger) {
|
|
cs, err := kubernetes.NewForConfig(cfg)
|
|
if err != nil {
|
|
logger.Warn("network-condition: kubernetes client", "err", err)
|
|
return
|
|
}
|
|
apply := func() {
|
|
now := metav1.Now().UTC().Format(time.RFC3339)
|
|
// Hand-build the SSA body so we only declare the fields we own.
|
|
// Force=true lets us reclaim the condition if a previous CNI's
|
|
// finalizer/cleanup left it owned by a different manager.
|
|
body := []byte(fmt.Sprintf(`{
|
|
"apiVersion": "v1",
|
|
"kind": "Node",
|
|
"metadata": {"name": %q},
|
|
"status": {"conditions": [{
|
|
"type": "NetworkUnavailable",
|
|
"status": "False",
|
|
"reason": "FlockReady",
|
|
"message": "flock-agent owns CNI on this node",
|
|
"lastHeartbeatTime": %q,
|
|
"lastTransitionTime": %q
|
|
}]}
|
|
}`, node, now, now))
|
|
force := true
|
|
_, err := cs.CoreV1().Nodes().Patch(ctx, node, types.ApplyPatchType, body,
|
|
metav1.PatchOptions{FieldManager: nodeStatusFieldManager, Force: &force},
|
|
"status")
|
|
if err != nil {
|
|
logger.Warn("network-condition: ssa apply failed", "err", err)
|
|
return
|
|
}
|
|
}
|
|
apply()
|
|
t := time.NewTicker(60 * time.Second)
|
|
defer t.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-t.C:
|
|
apply()
|
|
}
|
|
}
|
|
}
|