2026-04-24 22:33:48 -05:00
|
|
|
//go:build linux
|
|
|
|
|
|
|
|
|
|
package agent
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"context"
|
|
|
|
|
"fmt"
|
|
|
|
|
"net"
|
2026-04-25 09:57:32 -05:00
|
|
|
"os"
|
2026-04-24 22:33:48 -05:00
|
|
|
"time"
|
2026-04-25 09:25:58 -05:00
|
|
|
|
|
|
|
|
"code.fritzlab.net/fritzlab/flock/pkg/agent/netpol"
|
2026-04-24 22:33:48 -05:00
|
|
|
)
|
|
|
|
|
|
2026-04-25 09:57:32 -05:00
|
|
|
// multipathHashPolicyL4 is the fib_multipath_hash_policy value that makes
// the kernel include L4 fields (sport, dport, proto) in the multipath hash
// in addition to (saddr, daddr).
const multipathHashPolicyL4 = "1"

// hostMultipathHashSysctls maps each node-level sysctl file that
// flock-agent writes (best-effort) at startup to the value written.
//
// The kernel default, policy 0, hashes only on (saddr, daddr). Policy 1
// adds the L4 tuple, which yields real per-connection ECMP across
// multipath nexthops — needed for a sensible spread of traffic over
// several anycast pods living on the same node.
var hostMultipathHashSysctls = map[string]string{
	"/proc/sys/net/ipv4/fib_multipath_hash_policy": multipathHashPolicyL4,
	"/proc/sys/net/ipv6/fib_multipath_hash_policy": multipathHashPolicyL4,
}
|
|
|
|
|
|
|
|
|
|
// applyHostSysctls writes the sysctls in m, logging but not failing on
|
|
|
|
|
// errors. flock-agent is privileged so this works in the production
|
|
|
|
|
// DaemonSet; in environments where it doesn't, single-pod-per-node
|
|
|
|
|
// anycast still works (this only affects the multi-pod-per-node case).
|
|
|
|
|
func applyHostSysctls(s *Server) {
|
|
|
|
|
for path, value := range hostMultipathHashSysctls {
|
|
|
|
|
if err := os.WriteFile(path, []byte(value), 0o644); err != nil {
|
|
|
|
|
s.Logger.Warn("set host sysctl", "path", path, "value", value, "err", err)
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
s.Logger.Info("host sysctl set", "path", path, "value", value)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-24 22:33:48 -05:00
|
|
|
// configureRuntime wires Pod informer, IPAM, netlink, and BIRD on a real
|
|
|
|
|
// Linux node. Steps:
|
|
|
|
|
//
|
|
|
|
|
// 1. Wait for NodeConfig (operator-applied per-node CR).
|
|
|
|
|
// 2. Reconcile any pre-existing kernel state from allocations.json into
|
|
|
|
|
// IPAM.used (so we never re-allocate an in-flight pod's IP).
|
|
|
|
|
// 3. Garbage-collect any state==pending entries (partial ADDs from a
|
|
|
|
|
// previous agent generation).
|
|
|
|
|
// 4. Start the Pod informer (filtered to spec.nodeName == node).
|
|
|
|
|
// 5. Build PodHandler and SetHandlers(add, del, check).
|
|
|
|
|
// 6. Install BIRD blackhole summary routes + render initial config.
|
|
|
|
|
func (s *Server) configureRuntime(ctx context.Context) error {
|
2026-04-25 09:57:32 -05:00
|
|
|
applyHostSysctls(s)
|
|
|
|
|
|
2026-04-24 22:33:48 -05:00
|
|
|
if err := s.firstAvailableNodeConfig(ctx, 60*time.Second); err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
nc := s.NodeConfig.Load()
|
|
|
|
|
|
|
|
|
|
ipam, err := NewIPAM(nc.Spec.CIDR6, nc.Spec.CIDR4)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return fmt.Errorf("init ipam: %w", err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Reconcile committed entries; GC pending entries.
|
|
|
|
|
for _, a := range s.Store.Snapshot() {
|
|
|
|
|
switch a.State {
|
|
|
|
|
case StateCommitted:
|
|
|
|
|
if a.IP6 != "" {
|
|
|
|
|
ipam.MarkInUse(net.ParseIP(a.IP6))
|
|
|
|
|
}
|
|
|
|
|
if a.IP4 != "" {
|
|
|
|
|
ipam.MarkInUse(net.ParseIP(a.IP4))
|
|
|
|
|
}
|
|
|
|
|
case StatePending:
|
|
|
|
|
s.Logger.Info("GC pending allocation", "container_id", a.ContainerID)
|
|
|
|
|
_ = Teardown(a.ContainerID, net.ParseIP(a.IP6), net.ParseIP(a.IP4))
|
|
|
|
|
_ = s.Store.Delete(a.ContainerID)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pods, err := StartPodInformer(ctx, s.restCfg, s.Node, s.Logger)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return fmt.Errorf("pod informer: %w", err)
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-24 23:11:47 -05:00
|
|
|
// Keep NetworkUnavailable=False so the node.kubernetes.io/network-
|
|
|
|
|
// unavailable taint never gets re-applied. Calico's calico-node sets
|
|
|
|
|
// it on shutdown; without an owner replacing it, kubelet's controller
|
|
|
|
|
// taints the node and blocks scheduling.
|
|
|
|
|
go keepNetworkAvailable(ctx, s.restCfg, s.Node, s.Logger)
|
|
|
|
|
|
2026-04-24 22:33:48 -05:00
|
|
|
bird := &BirdManager{
|
2026-04-24 22:41:40 -05:00
|
|
|
NodeName: s.Node,
|
|
|
|
|
ConfigPath: "/etc/flock/bird/bird.conf",
|
2026-04-24 22:33:48 -05:00
|
|
|
BirdcSocket: "/run/flock/bird.ctl",
|
2026-04-24 22:41:40 -05:00
|
|
|
Logger: s.Logger,
|
2026-04-24 22:33:48 -05:00
|
|
|
}
|
2026-04-24 22:41:40 -05:00
|
|
|
// Install kernel blackhole routes for the node summary CIDRs. These
|
|
|
|
|
// stay regardless of BGP — they keep the kernel from sending unknown
|
|
|
|
|
// destinations within our /64 to a default route loop.
|
2026-04-24 22:33:48 -05:00
|
|
|
if err := bird.SummaryRoutes(nc); err != nil {
|
|
|
|
|
s.Logger.Warn("install summary routes", "err", err)
|
|
|
|
|
}
|
2026-04-24 23:02:33 -05:00
|
|
|
// Calico is fenced off this node (Tigera Installation CR adds a
|
|
|
|
|
// nodeAffinity excluding flock.fritzlab.net/agent on
|
|
|
|
|
// calicoNodeDaemonSet). flock now owns BGP from this host.
|
2026-04-25 07:36:47 -05:00
|
|
|
routerID := routerIDFromNodeIP(s.restCfg)
|
|
|
|
|
if err := bird.Render(nc, nil, nil, routerID); err != nil {
|
2026-04-24 23:02:33 -05:00
|
|
|
s.Logger.Warn("initial bird render", "err", err)
|
|
|
|
|
}
|
2026-04-25 07:36:47 -05:00
|
|
|
|
|
|
|
|
// AnycastReconciler is the single owner of bird re-renders going
|
|
|
|
|
// forward. It runs every 2s + on Pod readiness changes + on each
|
|
|
|
|
// successful CNI ADD/DEL.
|
|
|
|
|
anycast := NewAnycastReconciler(s.Node, s.Store, pods, s.NodeConfig, bird, routerID, s.Logger)
|
|
|
|
|
pods.OnReadyChange(anycast.Trigger)
|
|
|
|
|
go anycast.Run(ctx)
|
|
|
|
|
|
|
|
|
|
// Background tick for SummaryRoutes (idempotent) in case the kernel
|
|
|
|
|
// blackhole disappears for any reason.
|
2026-04-24 23:02:33 -05:00
|
|
|
go func() {
|
2026-04-25 07:36:47 -05:00
|
|
|
t := time.NewTicker(60 * time.Second)
|
2026-04-24 23:02:33 -05:00
|
|
|
defer t.Stop()
|
|
|
|
|
for {
|
|
|
|
|
select {
|
|
|
|
|
case <-ctx.Done():
|
|
|
|
|
return
|
|
|
|
|
case <-t.C:
|
2026-04-25 07:36:47 -05:00
|
|
|
if cur := s.NodeConfig.Load(); cur != nil {
|
|
|
|
|
_ = bird.SummaryRoutes(cur)
|
2026-04-24 23:02:33 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}()
|
2026-04-24 22:33:48 -05:00
|
|
|
|
2026-04-25 09:25:58 -05:00
|
|
|
// NetworkPolicy enforcement.
|
|
|
|
|
world := netpol.NewWorld(s.Logger)
|
|
|
|
|
if err := world.Start(ctx, s.restCfg); err != nil {
|
|
|
|
|
return fmt.Errorf("netpol informers: %w", err)
|
|
|
|
|
}
|
|
|
|
|
npApplier := &netpol.Applier{}
|
|
|
|
|
npReconciler := netpol.NewReconciler(world, func() []netpol.Pod {
|
|
|
|
|
return collectLocalPods(s.Store, pods)
|
|
|
|
|
}, npApplier, s.Logger)
|
|
|
|
|
go npReconciler.Run(ctx)
|
|
|
|
|
|
2026-04-24 22:33:48 -05:00
|
|
|
handler := &PodHandler{
|
|
|
|
|
Node: s.Node,
|
|
|
|
|
Store: s.Store,
|
|
|
|
|
IPAM: ipam,
|
|
|
|
|
Pods: pods,
|
|
|
|
|
NodeConfig: s.NodeConfig,
|
|
|
|
|
SetupFunc: Setup,
|
|
|
|
|
TeardownFunc: Teardown,
|
2026-04-25 09:25:58 -05:00
|
|
|
AfterCommit: func() {
|
|
|
|
|
anycast.Trigger()
|
|
|
|
|
// Re-evaluate policy on every CNI ADD/DEL so a brand-new
|
|
|
|
|
// pod's chain lands before its first packet egresses.
|
|
|
|
|
npReconciler.Trigger()
|
|
|
|
|
},
|
2026-04-24 22:33:48 -05:00
|
|
|
}
|
|
|
|
|
s.RPC.SetHandlers(handler.Add, handler.Del, handler.Check)
|
|
|
|
|
s.Logger.Info("runtime ready",
|
|
|
|
|
"asn", nc.Spec.BGP.ASN,
|
|
|
|
|
"cidr6", nc.Spec.CIDR6,
|
|
|
|
|
"cidr4", nc.Spec.CIDR4,
|
|
|
|
|
"committed", len(s.Store.Snapshot()),
|
|
|
|
|
)
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// routerIDFromNodeIP chooses the IPv4 address used as the BIRD router-id.
//
// It walks the addresses reported by net.InterfaceAddrs and returns the
// first one that is IPv4 and neither loopback nor link-local unicast, in
// dotted-quad form. The argument is currently ignored.
//
// When address enumeration fails or no address qualifies, it returns
// "127.0.0.1". bird will accept that value, but BGP peers won't like a
// duplicate router-id. This function emits no log itself, so callers that
// care should check for the fallback.
func routerIDFromNodeIP(_ interface{}) string {
	const loopbackFallback = "127.0.0.1"

	ifaceAddrs, err := net.InterfaceAddrs()
	if err != nil {
		return loopbackFallback
	}

	for _, addr := range ifaceAddrs {
		ipNet, isIPNet := addr.(*net.IPNet)
		if !isIPNet {
			continue
		}
		// To4 yields nil for anything that is not an IPv4 address.
		if ip4 := ipNet.IP.To4(); ip4 != nil && !ip4.IsLoopback() && !ip4.IsLinkLocalUnicast() {
			return ip4.String()
		}
	}
	return loopbackFallback
}
|