agent: maintain NetworkUnavailable=False on owned nodes
Build flock Image / build (push) Has been cancelled
Build flock Image / build (push) Has been cancelled
When Calico shuts down on a flock-labeled node, calico-node sets NetworkUnavailable=True with reason CalicoIsDown. Nothing replaces it, so the control plane's node controller applies the node.kubernetes.io/network-unavailable:NoSchedule taint and new pods can't land. flock-agent now patches Status.Conditions every 60s with NetworkUnavailable=False (reason=FlockReady). RBAC: nodes/status patch. Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -91,6 +91,9 @@ rules:
|
|||||||
- apiGroups: ["networking.k8s.io"]
|
- apiGroups: ["networking.k8s.io"]
|
||||||
resources: ["networkpolicies"]
|
resources: ["networkpolicies"]
|
||||||
verbs: ["get", "list", "watch"]
|
verbs: ["get", "list", "watch"]
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["nodes/status"]
|
||||||
|
verbs: ["patch"]
|
||||||
---
|
---
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
kind: ClusterRoleBinding
|
kind: ClusterRoleBinding
|
||||||
|
|||||||
@@ -18,6 +18,9 @@ rules:
|
|||||||
- apiGroups: ["networking.k8s.io"]
|
- apiGroups: ["networking.k8s.io"]
|
||||||
resources: ["networkpolicies"]
|
resources: ["networkpolicies"]
|
||||||
verbs: ["get", "list", "watch"]
|
verbs: ["get", "list", "watch"]
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["nodes/status"]
|
||||||
|
verbs: ["patch"]
|
||||||
---
|
---
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
kind: ClusterRoleBinding
|
kind: ClusterRoleBinding
|
||||||
|
|||||||
@@ -0,0 +1,66 @@
|
|||||||
|
package agent
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
corev1 "k8s.io/api/core/v1"
|
||||||
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||||
|
"k8s.io/apimachinery/pkg/types"
|
||||||
|
"k8s.io/client-go/kubernetes"
|
||||||
|
"k8s.io/client-go/rest"
|
||||||
|
)
|
||||||
|
|
||||||
|
// keepNetworkAvailable maintains a NetworkUnavailable=False condition on
|
||||||
|
// the node's status. Calico-node sets this False while it owns CNI; on
|
||||||
|
// shutdown it sets it to True with reason CalicoIsDown, which adds the
|
||||||
|
// node.kubernetes.io/network-unavailable taint and blocks new scheduling.
|
||||||
|
// Once flock-agent is in charge, we own the condition.
|
||||||
|
//
|
||||||
|
// Re-applies every minute — heartbeat-style — so a stale condition from a
|
||||||
|
// previous CNI is overwritten without an explicit transition.
|
||||||
|
func keepNetworkAvailable(ctx context.Context, cfg *rest.Config, node string, logger *slog.Logger) {
|
||||||
|
cs, err := kubernetes.NewForConfig(cfg)
|
||||||
|
if err != nil {
|
||||||
|
logger.Warn("network-condition: kubernetes client", "err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
apply := func() {
|
||||||
|
now := metav1.Now()
|
||||||
|
patch := map[string]interface{}{
|
||||||
|
"status": map[string]interface{}{
|
||||||
|
"conditions": []corev1.NodeCondition{{
|
||||||
|
Type: corev1.NodeNetworkUnavailable,
|
||||||
|
Status: corev1.ConditionFalse,
|
||||||
|
Reason: "FlockReady",
|
||||||
|
Message: "flock-agent owns CNI on this node",
|
||||||
|
LastHeartbeatTime: now,
|
||||||
|
LastTransitionTime: now,
|
||||||
|
}},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
body, _ := json.Marshal(patch)
|
||||||
|
_, err := cs.CoreV1().Nodes().Patch(ctx, node, types.MergePatchType, body, metav1.PatchOptions{}, "status")
|
||||||
|
if err != nil {
|
||||||
|
logger.Warn("network-condition: patch failed", "err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
apply()
|
||||||
|
t := time.NewTicker(60 * time.Second)
|
||||||
|
defer t.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
case <-t.C:
|
||||||
|
apply()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// fmt is not otherwise referenced in this file; this blank use keeps the import from breaking the build.
|
||||||
|
var _ = fmt.Sprintf
|
||||||
@@ -53,6 +53,12 @@ func (s *Server) configureRuntime(ctx context.Context) error {
|
|||||||
return fmt.Errorf("pod informer: %w", err)
|
return fmt.Errorf("pod informer: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Keep NetworkUnavailable=False so the node.kubernetes.io/network-
|
||||||
|
// unavailable taint never gets re-applied. Calico's calico-node sets
|
||||||
|
// it on shutdown; without an owner replacing it, the node controller
|
||||||
|
// taints the node and blocks scheduling.
|
||||||
|
go keepNetworkAvailable(ctx, s.restCfg, s.Node, s.Logger)
|
||||||
|
|
||||||
bird := &BirdManager{
|
bird := &BirdManager{
|
||||||
NodeName: s.Node,
|
NodeName: s.Node,
|
||||||
ConfigPath: "/etc/flock/bird/bird.conf",
|
ConfigPath: "/etc/flock/bird/bird.conf",
|
||||||
|
|||||||
Reference in New Issue
Block a user