Files

82 lines
2.8 KiB
Go
Raw Permalink Normal View History

package agent
import (
"context"
"fmt"
"log/slog"
"time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
)
// nodeStatusFieldManager identifies flock-agent in apiserver field-manager
// bookkeeping. Server-Side Apply only takes ownership of the fields we send,
// so other managers (kubelet, kube-controller-manager) keep their conditions
// untouched between our writes.
const nodeStatusFieldManager = "flock-agent"
// keepNetworkAvailable maintains a NetworkUnavailable=False condition on
// the node's status. Calico-node sets this False while it owns CNI; on
// shutdown it sets it to True with reason CalicoIsDown, which adds the
// node.kubernetes.io/network-unavailable taint and blocks new scheduling.
// Once flock-agent is in charge, we own that single condition.
//
// Uses Server-Side Apply against the status subresource. NodeStatus.Conditions
// is a listType=map keyed by `type`, so SSA merges by type — our partial body
// declares ownership of just the NetworkUnavailable entry and leaves the
// kubelet-managed conditions (Ready, MemoryPressure, DiskPressure, PIDPressure)
// alone. A prior implementation used JSON merge-patch with a one-element
// conditions array, which the apiserver REPLACES (merge-patch on arrays is
// whole-array semantics) — that race-stripped the kubelet conditions every
// 60s and produced ~5s flickers in `kubectl get nodes`.
//
// Re-applies every minute (heartbeat-style) so a stale condition from a
// previous CNI is overwritten without an explicit transition. Every apply
// refreshes lastHeartbeatTime. lastTransitionTime is carried over from the
// live condition while that condition is already False, and is only set to
// "now" when the apply actually flips the status or when the current
// condition cannot be read.
//
// The function blocks until ctx is cancelled. If no client can be built from
// cfg it logs a warning and returns immediately. Failures are reported
// through logger only; nothing is returned to the caller.
func keepNetworkAvailable(ctx context.Context, cfg *rest.Config, node string, logger *slog.Logger) {
	cs, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		logger.Warn("network-condition: kubernetes client", "err", err)
		return
	}
	nodes := cs.CoreV1().Nodes()

	apply := func() {
		now := metav1.Now().UTC().Format(time.RFC3339)

		// lastTransitionTime must only move when the status changes. Look at
		// the live condition: if it is already False, keep its timestamp so
		// the heartbeat does not masquerade as a transition every minute.
		// The read is best-effort (it needs `get` on nodes); on failure we
		// fall back to "now", which is what was written before this lookup
		// existed.
		transition := now
		cur, err := nodes.Get(ctx, node, metav1.GetOptions{})
		if err != nil {
			logger.Debug("network-condition: read current condition", "err", err)
		} else {
			for i := range cur.Status.Conditions {
				c := &cur.Status.Conditions[i]
				if c.Type != "NetworkUnavailable" {
					continue
				}
				if c.Status == "False" && !c.LastTransitionTime.IsZero() {
					transition = c.LastTransitionTime.UTC().Format(time.RFC3339)
				}
				break
			}
		}

		// Hand-build the SSA body so we only declare the fields we own.
		// NOTE(review): %q is Go quoting, not JSON quoting; the two agree
		// for plain DNS-style node names and RFC 3339 timestamps, which is
		// what is expected here.
		body := []byte(fmt.Sprintf(`{
"apiVersion": "v1",
"kind": "Node",
"metadata": {"name": %q},
"status": {"conditions": [{
"type": "NetworkUnavailable",
"status": "False",
"reason": "FlockReady",
"message": "flock-agent owns CNI on this node",
"lastHeartbeatTime": %q,
"lastTransitionTime": %q
}]}
}`, node, now, transition))

		// Force=true lets us reclaim the condition if a previous CNI's
		// finalizer/cleanup left it owned by a different manager.
		force := true
		_, err = nodes.Patch(ctx, node, types.ApplyPatchType, body,
			metav1.PatchOptions{FieldManager: nodeStatusFieldManager, Force: &force},
			"status")
		if err != nil {
			logger.Warn("network-condition: ssa apply failed", "err", err)
		}
	}

	// Apply once right away, then on every tick until ctx is cancelled.
	apply()
	t := time.NewTicker(60 * time.Second)
	defer t.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-t.C:
			apply()
		}
	}
}