From c7fb1596329b65e4afdf3051d4e7266b67c66b79 Mon Sep 17 00:00:00 2001 From: Donavan Fritz Date: Fri, 24 Apr 2026 23:11:47 -0500 Subject: [PATCH] agent: maintain NetworkUnavailable=False on owned nodes When Calico shuts down on a flock-labeled node, calico-node sets NetworkUnavailable=True with reason CalicoIsDown. Nothing replaces it, so kubelet's NodeController applies node.kubernetes.io/network- unavailable:NoSchedule and new pods can't land. flock-agent now patches Status.Conditions every 60s with NetworkUnavailable=False (reason=FlockReady). RBAC: nodes/status patch. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- deploy/install.yaml | 3 ++ deploy/rbac/serviceaccount.yaml | 3 ++ pkg/agent/nodecondition.go | 66 +++++++++++++++++++++++++++++++++ pkg/agent/runtime_linux.go | 6 +++ 4 files changed, 78 insertions(+) create mode 100644 pkg/agent/nodecondition.go diff --git a/deploy/install.yaml b/deploy/install.yaml index 5baf640..7fc587a 100644 --- a/deploy/install.yaml +++ b/deploy/install.yaml @@ -91,6 +91,9 @@ rules: - apiGroups: ["networking.k8s.io"] resources: ["networkpolicies"] verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["nodes/status"] + verbs: ["patch"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding diff --git a/deploy/rbac/serviceaccount.yaml b/deploy/rbac/serviceaccount.yaml index d317e63..594ac4d 100644 --- a/deploy/rbac/serviceaccount.yaml +++ b/deploy/rbac/serviceaccount.yaml @@ -18,6 +18,9 @@ rules: - apiGroups: ["networking.k8s.io"] resources: ["networkpolicies"] verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["nodes/status"] + verbs: ["patch"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding diff --git a/pkg/agent/nodecondition.go b/pkg/agent/nodecondition.go new file mode 100644 index 0000000..af06471 --- /dev/null +++ b/pkg/agent/nodecondition.go @@ -0,0 +1,66 @@ +package agent + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +// keepNetworkAvailable maintains a NetworkUnavailable=False condition on +// the node's status. Calico-node sets this False while it owns CNI; on +// shutdown it sets it to True with reason CalicoIsDown, which adds the +// node.kubernetes.io/network-unavailable taint and blocks new scheduling. +// Once flock-agent is in charge, we own the condition. +// +// Re-applies every minute — heartbeat-style — so a stale condition from a +// previous CNI is overwritten without an explicit transition. +func keepNetworkAvailable(ctx context.Context, cfg *rest.Config, node string, logger *slog.Logger) { + cs, err := kubernetes.NewForConfig(cfg) + if err != nil { + logger.Warn("network-condition: kubernetes client", "err", err) + return + } + apply := func() { + now := metav1.Now() + patch := map[string]interface{}{ + "status": map[string]interface{}{ + "conditions": []corev1.NodeCondition{{ + Type: corev1.NodeNetworkUnavailable, + Status: corev1.ConditionFalse, + Reason: "FlockReady", + Message: "flock-agent owns CNI on this node", + LastHeartbeatTime: now, + LastTransitionTime: now, + }}, + }, + } + body, _ := json.Marshal(patch) + _, err := cs.CoreV1().Nodes().Patch(ctx, node, types.MergePatchType, body, metav1.PatchOptions{}, "status") + if err != nil { + logger.Warn("network-condition: patch failed", "err", err) + return + } + } + apply() + t := time.NewTicker(60 * time.Second) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + apply() + } + } +} + +// silence unused-import warnings on non-Linux builds where this is unused. +var _ = fmt.Sprintf diff --git a/pkg/agent/runtime_linux.go b/pkg/agent/runtime_linux.go index c507e38..5954660 100644 --- a/pkg/agent/runtime_linux.go +++ b/pkg/agent/runtime_linux.go @@ -53,6 +53,12 @@ func (s *Server) configureRuntime(ctx context.Context) error { return fmt.Errorf("pod informer: %w", err) } + // Keep NetworkUnavailable=False so the node.kubernetes.io/network- + // unavailable taint never gets re-applied. Calico's calico-node sets + // it on shutdown; without an owner replacing it, kubelet's controller + // taints the node and blocks scheduling. + go keepNetworkAvailable(ctx, s.restCfg, s.Node, s.Logger) + bird := &BirdManager{ NodeName: s.Node, ConfigPath: "/etc/flock/bird/bird.conf",