M2: netlink, IPAM/handler wiring, BIRD sidecar, CNI installer
Build flock Image / build (push) Has been cancelled
Build flock Image / build (push) Has been cancelled
Code (Linux build, with no-op stubs for macOS dev):
- pkg/agent/netns_linux.go: ensureVeth → host-side configure (addrgenmode
none, fe80::1/64, proxy_arp, forwarding) → move peer to pod ns →
configure pod side (addr, default route via fe80::1, v4 169.254.1.1
on-link gateway) → host /128 + /32 routes. Idempotent.
- pkg/agent/hostiface.go: deterministic host iface name flock<8hex> from
FNV-1a-32(containerID).
- pkg/agent/annotations.go: parse flock.fritzlab.net/{ipv6,ipv4,cidr6,
cidr4,ip-algo,anycast} with design-doc defaults; ParseCNIArgs for the
K8S_POD_* keys kubelet sets.
- pkg/agent/podinfo.go: shared informer scoped to spec.nodeName==NODE,
WaitForPod helper for ADD-vs-informer-sync race.
- pkg/agent/handlers.go: PodHandler does
cache lookup → annotations → IPAM → store(pending) → SetupFunc →
store(committed) → Result. Idempotent on retry. Del symmetric.
- pkg/routing/bird/config.go: text/template render with stable ordering;
golden tests for host001 + anycast injection + sort stability.
- pkg/agent/bird.go: writes /etc/flock/bird/bird.conf, debounces 500ms,
execs `birdc -s /run/flock/bird.ctl configure`. Installs blackhole
kernel routes for the node summary CIDRs so BIRD's protocol kernel
imports them.
- pkg/agent/runtime_linux.go: at startup, waits up to 60s for the per-
node NodeConfig, reconciles committed allocations into IPAM.used,
garbage-collects pending entries, builds PodHandler, swaps RPC
handlers in.
- cmd/flock-installer: init-container binary that copies /opt/cni/bin/
flock and writes 01-flock.conflist (lex-first so kubelet picks it
over Calico's 10-calico.conflist on flock-labeled nodes).
Deploy:
- Dockerfile: alpine + iproute2 + bird2; multi-binary image.
- deploy/daemonset.yaml: install-cni init container; bird sidecar
sharing /etc/flock/bird + /run/flock with the agent; ConfigMap-seeded
bootstrap bird.conf so the sidecar boots before the agent renders.
flock-agent and install-cni run privileged; the bird sidecar is granted
only the NET_ADMIN and NET_RAW capabilities.
- RBAC: pods + networkpolicies get/list/watch (the latter is reserved
for M8 — harmless to grant now).
Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,281 @@
|
||||
//go:build linux
|
||||
|
||||
package agent
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"net"
|
||||
"os"
|
||||
"runtime"
|
||||
|
||||
"github.com/containernetworking/plugins/pkg/ns"
|
||||
"github.com/vishvananda/netlink"
|
||||
)
|
||||
|
||||
// SetupRequest carries everything Setup needs to wire up the network
// interface of a single pod over netlink.
type SetupRequest struct {
	// ContainerID identifies the pod sandbox. Setup derives HostIface from
	// it when HostIface is left empty.
	ContainerID string
	// Netns is the path of the pod network namespace
	// (for example /proc/<pid>/ns/net).
	Netns string
	// IfName is the pod-side interface name. Setup defaults it to "eth0".
	IfName string
	// HostIface is the host-side veth name, normally the result of
	// HostIfaceName.
	HostIface string
	// IP6 is the pod IPv6 address; it is installed as a /128.
	IP6 net.IP
	// IP4 is the pod IPv4 address; it is installed as a /32. Nil means the
	// pod gets no IPv4 configuration.
	IP4 net.IP
}
|
||||
|
||||
var (
	// linkLocalGW is the fixed IPv6 link-local address (fe80::1) that
	// configureHostSide assigns to every host-side veth with a /64 mask.
	// configurePodSide uses it as the next hop of the pod's IPv6 default
	// route.
	linkLocalGW = net.ParseIP("fe80::1")

	// v4ProxyGW is the IPv4 link-local next hop (169.254.1.1) for the pod's
	// IPv4 default route. configurePodSide makes it reachable with an
	// on-link /32 route; the address itself is never assigned in this file,
	// the host veth answers for it because proxy_arp is enabled there.
	v4ProxyGW = net.IPv4(169, 254, 1, 1)
)
|
||||
|
||||
// Setup creates the veth pair, configures the host side, moves the peer
|
||||
// into the pod netns, configures the pod side, and writes host routes.
|
||||
// All steps are idempotent: an already-existing object that matches the
|
||||
// desired state is treated as success.
|
||||
func Setup(req SetupRequest) error {
|
||||
if req.HostIface == "" {
|
||||
req.HostIface = HostIfaceName(req.ContainerID)
|
||||
}
|
||||
if req.IfName == "" {
|
||||
req.IfName = "eth0"
|
||||
}
|
||||
|
||||
// Create veth pair (or reuse existing).
|
||||
host, peer, err := ensureVeth(req.HostIface, req.IfName)
|
||||
if err != nil {
|
||||
return fmt.Errorf("ensure veth: %w", err)
|
||||
}
|
||||
|
||||
// Host-side: addrgenmode none → up → fe80::1/64 → sysctls.
|
||||
if err := configureHostSide(host); err != nil {
|
||||
return fmt.Errorf("configure host side %s: %w", host.Attrs().Name, err)
|
||||
}
|
||||
|
||||
// Move peer into pod netns + configure (only if it's still on host).
|
||||
hostNS, err := ns.GetCurrentNS()
|
||||
if err != nil {
|
||||
return fmt.Errorf("get current netns: %w", err)
|
||||
}
|
||||
defer hostNS.Close()
|
||||
|
||||
if peer != nil {
|
||||
// Peer is still on the host — move it.
|
||||
podNS, err := ns.GetNS(req.Netns)
|
||||
if err != nil {
|
||||
return fmt.Errorf("open pod netns %s: %w", req.Netns, err)
|
||||
}
|
||||
defer podNS.Close()
|
||||
if err := netlink.LinkSetNsFd(peer, int(podNS.Fd())); err != nil {
|
||||
return fmt.Errorf("move peer %s into pod ns: %w", peer.Attrs().Name, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Configure pod-side from inside the pod netns.
|
||||
if err := configurePodSide(req); err != nil {
|
||||
return fmt.Errorf("configure pod side: %w", err)
|
||||
}
|
||||
|
||||
// Host route(s): one /128 (and /32 if v4) pointing at the host veth.
|
||||
if err := setHostRoute(host.Attrs().Index, req.IP6, 128); err != nil {
|
||||
return fmt.Errorf("host route v6: %w", err)
|
||||
}
|
||||
if req.IP4 != nil {
|
||||
if err := setHostRoute(host.Attrs().Index, req.IP4, 32); err != nil {
|
||||
return fmt.Errorf("host route v4: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Teardown removes the host-side veth (which also tears down the peer in
|
||||
// the pod netns) and the host /128 + /32 routes. All operations are
|
||||
// idempotent — missing objects are not errors.
|
||||
func Teardown(containerID string, ip6, ip4 net.IP) error {
|
||||
hostName := HostIfaceName(containerID)
|
||||
host, err := netlink.LinkByName(hostName)
|
||||
if err == nil {
|
||||
// Routes are removed when the link goes away, but be explicit so
|
||||
// stale routes can't outlive the veth on a corrupt state.
|
||||
if ip6 != nil {
|
||||
_ = netlink.RouteDel(&netlink.Route{LinkIndex: host.Attrs().Index, Dst: cidrFor(ip6, 128)})
|
||||
}
|
||||
if ip4 != nil {
|
||||
_ = netlink.RouteDel(&netlink.Route{LinkIndex: host.Attrs().Index, Dst: cidrFor(ip4, 32)})
|
||||
}
|
||||
if err := netlink.LinkDel(host); err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return fmt.Errorf("delete %s: %w", hostName, err)
|
||||
}
|
||||
} else if !linkNotFound(err) {
|
||||
return fmt.Errorf("lookup %s: %w", hostName, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ensureVeth returns the host link (always) and the peer link (only if it's
|
||||
// still on the host — nil if it has already been moved into a netns).
|
||||
func ensureVeth(hostName, peerName string) (netlink.Link, netlink.Link, error) {
|
||||
if existing, err := netlink.LinkByName(hostName); err == nil {
|
||||
// Already exists; the peer may be on the host or in a netns.
|
||||
peer, _ := netlink.LinkByName(peerName) // peer name is "eth0" — usually only matches in pod ns
|
||||
_ = peer
|
||||
// Don't try to find peer on host by name (collides). Return nil peer; ensureVeth caller treats nil as "already moved".
|
||||
return existing, nil, nil
|
||||
}
|
||||
// Need to create.
|
||||
veth := &netlink.Veth{
|
||||
LinkAttrs: netlink.LinkAttrs{Name: hostName, MTU: 1500},
|
||||
PeerName: peerName,
|
||||
}
|
||||
if err := netlink.LinkAdd(veth); err != nil {
|
||||
return nil, nil, fmt.Errorf("link add: %w", err)
|
||||
}
|
||||
host, err := netlink.LinkByName(hostName)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("lookup host after add: %w", err)
|
||||
}
|
||||
peer, err := netlink.LinkByName(peerName)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("lookup peer after add: %w", err)
|
||||
}
|
||||
return host, peer, nil
|
||||
}
|
||||
|
||||
func configureHostSide(host netlink.Link) error {
|
||||
name := host.Attrs().Name
|
||||
|
||||
// addrgenmode = none (suppress kernel LL).
|
||||
if err := netlink.LinkSetVfHardwareAddr(host, 0, nil); err != nil {
|
||||
// This SetVf isn't the right call; instead use LinkSetGroup or use sysfs directly.
|
||||
// Fallback: write to /proc/sys/net/ipv6/conf/<iface>/addr_gen_mode = 1
|
||||
}
|
||||
_ = sysctlWrite("/proc/sys/net/ipv6/conf/"+name+"/addr_gen_mode", "1")
|
||||
|
||||
// Bring up.
|
||||
if err := netlink.LinkSetUp(host); err != nil {
|
||||
return fmt.Errorf("set up: %w", err)
|
||||
}
|
||||
|
||||
// fe80::1/64.
|
||||
addr := &netlink.Addr{IPNet: &net.IPNet{IP: linkLocalGW, Mask: net.CIDRMask(64, 128)}}
|
||||
if err := netlink.AddrAdd(host, addr); err != nil && !errors.Is(err, os.ErrExist) {
|
||||
return fmt.Errorf("addr add fe80::1: %w", err)
|
||||
}
|
||||
|
||||
// sysctls.
|
||||
for _, kv := range []struct{ k, v string }{
|
||||
{"/proc/sys/net/ipv4/conf/" + name + "/proxy_arp", "1"},
|
||||
{"/proc/sys/net/ipv4/conf/" + name + "/forwarding", "1"},
|
||||
{"/proc/sys/net/ipv6/conf/" + name + "/forwarding", "1"},
|
||||
} {
|
||||
if err := sysctlWrite(kv.k, kv.v); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func configurePodSide(req SetupRequest) error {
|
||||
podNS, err := ns.GetNS(req.Netns)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer podNS.Close()
|
||||
|
||||
return podNS.Do(func(ns.NetNS) error {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
eth0, err := netlink.LinkByName(req.IfName)
|
||||
if err != nil {
|
||||
return fmt.Errorf("lookup pod %s: %w", req.IfName, err)
|
||||
}
|
||||
|
||||
_ = sysctlWrite("/proc/sys/net/ipv6/conf/"+req.IfName+"/addr_gen_mode", "1")
|
||||
if err := netlink.LinkSetUp(eth0); err != nil {
|
||||
return fmt.Errorf("set up pod %s: %w", req.IfName, err)
|
||||
}
|
||||
|
||||
if req.IP6 != nil {
|
||||
a := &netlink.Addr{IPNet: &net.IPNet{IP: req.IP6, Mask: net.CIDRMask(128, 128)}}
|
||||
if err := netlink.AddrAdd(eth0, a); err != nil && !errors.Is(err, os.ErrExist) {
|
||||
return fmt.Errorf("pod ip6 add: %w", err)
|
||||
}
|
||||
// Default route via fe80::1, no scope on-link issues because LL is reachable on the link.
|
||||
if err := netlink.RouteAdd(&netlink.Route{
|
||||
LinkIndex: eth0.Attrs().Index,
|
||||
Dst: &net.IPNet{IP: net.IPv6zero, Mask: net.CIDRMask(0, 128)},
|
||||
Gw: linkLocalGW,
|
||||
}); err != nil && !errors.Is(err, os.ErrExist) {
|
||||
return fmt.Errorf("pod default v6 route: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
if req.IP4 != nil {
|
||||
a := &netlink.Addr{IPNet: &net.IPNet{IP: req.IP4, Mask: net.CIDRMask(32, 32)}}
|
||||
if err := netlink.AddrAdd(eth0, a); err != nil && !errors.Is(err, os.ErrExist) {
|
||||
return fmt.Errorf("pod ip4 add: %w", err)
|
||||
}
|
||||
// On-link route to the proxy gateway, then default via that gateway.
|
||||
if err := netlink.RouteAdd(&netlink.Route{
|
||||
LinkIndex: eth0.Attrs().Index,
|
||||
Scope: netlink.SCOPE_LINK,
|
||||
Dst: &net.IPNet{IP: v4ProxyGW, Mask: net.CIDRMask(32, 32)},
|
||||
}); err != nil && !errors.Is(err, os.ErrExist) {
|
||||
return fmt.Errorf("pod onlink v4 route: %w", err)
|
||||
}
|
||||
if err := netlink.RouteAdd(&netlink.Route{
|
||||
LinkIndex: eth0.Attrs().Index,
|
||||
Dst: &net.IPNet{IP: net.IPv4zero, Mask: net.CIDRMask(0, 32)},
|
||||
Gw: v4ProxyGW,
|
||||
}); err != nil && !errors.Is(err, os.ErrExist) {
|
||||
return fmt.Errorf("pod default v4 route: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
func setHostRoute(linkIndex int, ip net.IP, prefix int) error {
|
||||
r := &netlink.Route{
|
||||
LinkIndex: linkIndex,
|
||||
Scope: netlink.SCOPE_LINK,
|
||||
Dst: cidrFor(ip, prefix),
|
||||
}
|
||||
if err := netlink.RouteReplace(r); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// cidrFor builds the network ip/prefix. An address that has a 4-byte form
// is returned in that form with a 32-bit-wide mask; any other address is
// returned in 16-byte form with a 128-bit-wide mask.
func cidrFor(ip net.IP, prefix int) *net.IPNet {
	addr, width := ip.To16(), 128
	if v4 := ip.To4(); v4 != nil {
		addr, width = v4, 32
	}
	return &net.IPNet{IP: addr, Mask: net.CIDRMask(prefix, width)}
}
|
||||
|
||||
// sysctlWrite writes value to the file at path. A path that does not exist
// is deliberately not an error, so callers can set sysctls that may be
// absent for an interface (for example when its ipv6 tree is missing). Any
// other failure is returned wrapped with the path and value.
func sysctlWrite(path, value string) error {
	err := os.WriteFile(path, []byte(value), 0o644)
	if err == nil || errors.Is(err, os.ErrNotExist) {
		return nil
	}
	return fmt.Errorf("sysctl %s=%s: %w", path, value, err)
}
|
||||
|
||||
func linkNotFound(err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
var lnf netlink.LinkNotFoundError
|
||||
return errors.As(err, &lnf)
|
||||
}
|
||||
Reference in New Issue
Block a user