agent: maintain NetworkUnavailable=False on owned nodes
Build flock Image / build (push) Has been cancelled

When Calico shuts down on a flock-labeled node, calico-node sets
NetworkUnavailable=True with reason CalicoIsDown. Nothing replaces it,
so the control plane's node controller applies the
node.kubernetes.io/network-unavailable:NoSchedule taint and new pods
can't land.

flock-agent now patches Status.Conditions every 60s with
NetworkUnavailable=False (reason=FlockReady). RBAC: nodes/status patch.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Donavan Fritz
2026-04-24 23:11:47 -05:00
parent a1222f13cc
commit c7fb159632
4 changed files with 78 additions and 0 deletions
+66
View File
@@ -0,0 +1,66 @@
package agent
import (
"context"
"encoding/json"
"fmt"
"log/slog"
"time"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
)
// keepNetworkAvailable maintains a NetworkUnavailable=False condition on
// the node's status. Calico-node sets this False while it owns CNI; on
// shutdown it sets it to True with reason CalicoIsDown, which adds the
// node.kubernetes.io/network-unavailable taint and blocks new scheduling.
// Once flock-agent is in charge, we own the condition.
//
// The condition is applied once immediately and then re-applied every
// minute — heartbeat-style — so a stale condition from a previous CNI is
// overwritten without an explicit transition.
//
// The function blocks until ctx is cancelled, so callers are expected to run
// it in its own goroutine. It returns early (after logging a warning) if a
// Kubernetes client cannot be built from cfg. Individual patch failures are
// logged and retried on the next tick; they never terminate the loop.
//
// Parameters:
//   - ctx: cancels the heartbeat loop and is passed to every API call.
//   - cfg: REST configuration used to build the Kubernetes clientset.
//   - node: name of the Node object whose status is patched.
//   - logger: destination for warnings.
func keepNetworkAvailable(ctx context.Context, cfg *rest.Config, node string, logger *slog.Logger) {
	cs, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		logger.Warn("network-condition: kubernetes client", "err", err)
		return
	}

	apply := func() {
		// NOTE(review): LastTransitionTime is refreshed on every heartbeat,
		// not only when the status actually flips.
		now := metav1.Now()
		patch := map[string]any{
			"status": map[string]any{
				"conditions": []corev1.NodeCondition{{
					Type:               corev1.NodeNetworkUnavailable,
					Status:             corev1.ConditionFalse,
					Reason:             "FlockReady",
					Message:            "flock-agent owns CNI on this node",
					LastHeartbeatTime:  now,
					LastTransitionTime: now,
				}},
			},
		}
		body, err := json.Marshal(patch)
		if err != nil {
			logger.Warn("network-condition: marshal patch", "err", err)
			return
		}
		// A strategic merge patch merges status.conditions entries by their
		// "type" key. A plain JSON merge patch would replace the whole list
		// and drop the conditions kubelet maintains (Ready, MemoryPressure,
		// DiskPressure, PIDPressure).
		_, err = cs.CoreV1().Nodes().Patch(ctx, node, types.StrategicMergePatchType, body, metav1.PatchOptions{}, "status")
		if err != nil {
			logger.Warn("network-condition: patch failed", "err", err)
		}
	}

	apply()

	t := time.NewTicker(60 * time.Second)
	defer t.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-t.C:
			apply()
		}
	}
}
// Blank-identifier reference that keeps the "fmt" import in use so the
// compiler does not reject it as unused. The original note attributes this
// to non-Linux builds.
// NOTE(review): nothing else in the visible part of this file references
// fmt — confirm whether the import and this line can simply be dropped.
var _ = fmt.Sprintf
+6
View File
@@ -53,6 +53,12 @@ func (s *Server) configureRuntime(ctx context.Context) error {
return fmt.Errorf("pod informer: %w", err)
}
// Keep NetworkUnavailable=False so the
// node.kubernetes.io/network-unavailable taint never gets re-applied.
// Calico's calico-node sets the condition to True on shutdown; without
// an owner replacing it, the control plane's node controller taints
// the node and blocks scheduling.
go keepNetworkAvailable(ctx, s.restCfg, s.Node, s.Logger)
bird := &BirdManager{
NodeName: s.Node,
ConfigPath: "/etc/flock/bird/bird.conf",