Files
flock/pkg/agent/netns_linux.go
T
Donavan Fritz eb1f5e0d8d
Build flock Image / build (push) Has been cancelled
M2: netlink, IPAM/handler wiring, BIRD sidecar, CNI installer
Code (Linux build, with no-op stubs for macOS dev):
- pkg/agent/netns_linux.go: ensureVeth → host-side configure (addrgenmode
  none, fe80::1/64, proxy_arp, forwarding) → move peer to pod ns →
  configure pod side (addr, default route via fe80::1, v4 169.254.1.1
  on-link gateway) → host /128 + /32 routes. Idempotent.
- pkg/agent/hostiface.go: deterministic host iface name flock<8hex> from
  FNV-1a-32(containerID).
- pkg/agent/annotations.go: parse flock.fritzlab.net/{ipv6,ipv4,cidr6,
  cidr4,ip-algo,anycast} with design-doc defaults; ParseCNIArgs for the
  K8S_POD_* keys kubelet sets.
- pkg/agent/podinfo.go: shared informer scoped to spec.nodeName==NODE,
  WaitForPod helper for ADD-vs-informer-sync race.
- pkg/agent/handlers.go: PodHandler does
    cache lookup → annotations → IPAM → store(pending) → SetupFunc →
    store(committed) → Result. Idempotent on retry. Del symmetric.
- pkg/routing/bird/config.go: text/template render with stable ordering;
  golden tests for host001 + anycast injection + sort stability.
- pkg/agent/bird.go: writes /etc/flock/bird/bird.conf, debounces 500ms,
  execs `birdc -s /run/flock/bird.ctl configure`. Installs blackhole
  kernel routes for the node summary CIDRs so BIRD's protocol kernel
  imports them.
- pkg/agent/runtime_linux.go: at startup, waits up to 60s for the per-
  node NodeConfig, reconciles committed allocations into IPAM.used,
  garbage-collects pending entries, builds PodHandler, swaps RPC
  handlers in.
- cmd/flock-installer: init-container binary that copies /opt/cni/bin/
  flock and writes 01-flock.conflist (lex-first so kubelet picks it
  over Calico's 10-calico.conflist on flock-labeled nodes).

Deploy:
- Dockerfile: alpine + iproute2 + bird2; multi-binary image.
- deploy/daemonset.yaml: install-cni init container; bird sidecar
  sharing /etc/flock/bird + /run/flock with the agent; ConfigMap-seeded
  bootstrap bird.conf so the sidecar boots before the agent renders.
  Privileged on flock-agent + install-cni; bird sidecar uses
  NET_ADMIN/RAW only.
- RBAC: pods + networkpolicies get/list/watch (the latter is reserved
  for M8 — harmless to grant now).

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 22:33:48 -05:00

282 lines
8.9 KiB
Go

//go:build linux
package agent
import (
"errors"
"fmt"
"net"
"os"
"runtime"
"github.com/containernetworking/plugins/pkg/ns"
"github.com/vishvananda/netlink"
)
// SetupRequest carries everything Setup needs to wire up networking for a
// single pod.
type SetupRequest struct {
	// ContainerID identifies the pod sandbox. Setup derives HostIface from
	// it via HostIfaceName when HostIface is left empty.
	ContainerID string
	// Netns is the path to the pod's network namespace (/proc/<pid>/ns/net).
	Netns string
	// IfName is the pod-side interface name; Setup uses "eth0" when empty.
	IfName string
	// HostIface is the host-side veth name, as produced by HostIfaceName.
	HostIface string
	// IP6 is the pod's IPv6 address, installed inside the pod as a /128.
	IP6 net.IP
	// IP4 is the pod's IPv4 address, installed inside the pod as a /32.
	// It may be nil, in which case no IPv4 configuration is applied.
	IP4 net.IP
}
var (
	// linkLocalGW is the fixed IPv6 link-local gateway address placed on
	// every host-side veth. The pod's IPv6 default route uses it as the
	// next hop, so the pod does not depend on a kernel-generated link-local
	// address (and its DAD delay) on the host side.
	linkLocalGW = net.ParseIP("fe80::1")

	// v4ProxyGW is the link-local IPv4 address used as the pod's IPv4
	// next hop; the host answers for it through proxy ARP on the veth.
	// 169.254.1.1 is the address conventionally used for this purpose by
	// container CNIs.
	v4ProxyGW = net.IPv4(169, 254, 1, 1)
)
// Setup wires one pod into the host network. In order, it:
//
//  1. creates the veth pair, or reuses an existing host-side link;
//  2. configures the host side (link up, fe80::1/64, sysctls);
//  3. moves the peer into the pod netns if it is still on the host;
//  4. configures the pod side (addresses and default routes);
//  5. installs a host route per pod address pointing at the host veth.
//
// An empty req.HostIface defaults to HostIfaceName(req.ContainerID) and an
// empty req.IfName defaults to "eth0". A nil req.IP6 or req.IP4 skips that
// address family entirely, both inside the pod and for the host route.
//
// The steps are meant to be safe to re-run: an existing host link is
// reused and the host routes are installed with replace semantics.
//
// The returned error wraps the failing step's error with a short prefix
// naming that step.
func Setup(req SetupRequest) error {
	if req.HostIface == "" {
		req.HostIface = HostIfaceName(req.ContainerID)
	}
	if req.IfName == "" {
		req.IfName = "eth0"
	}

	// Create the veth pair (or reuse an existing one). peer is nil when the
	// host link already existed, which ensureVeth reports as "already moved".
	host, peer, err := ensureVeth(req.HostIface, req.IfName)
	if err != nil {
		return fmt.Errorf("ensure veth: %w", err)
	}

	if err := configureHostSide(host); err != nil {
		return fmt.Errorf("configure host side %s: %w", host.Attrs().Name, err)
	}

	// The peer is still in this namespace: hand it over to the pod.
	if peer != nil {
		podNS, err := ns.GetNS(req.Netns)
		if err != nil {
			return fmt.Errorf("open pod netns %s: %w", req.Netns, err)
		}
		defer podNS.Close()
		if err := netlink.LinkSetNsFd(peer, int(podNS.Fd())); err != nil {
			return fmt.Errorf("move peer %s into pod ns: %w", peer.Attrs().Name, err)
		}
	}

	if err := configurePodSide(req); err != nil {
		return fmt.Errorf("configure pod side: %w", err)
	}

	// Host routes: a /128 for v6 and a /32 for v4, each only when the
	// corresponding address was requested. Without the nil guard a missing
	// IP6 would produce a route with a nil destination.
	hostIndex := host.Attrs().Index
	if req.IP6 != nil {
		if err := setHostRoute(hostIndex, req.IP6, 128); err != nil {
			return fmt.Errorf("host route v6: %w", err)
		}
	}
	if req.IP4 != nil {
		if err := setHostRoute(hostIndex, req.IP4, 32); err != nil {
			return fmt.Errorf("host route v4: %w", err)
		}
	}
	return nil
}
// Teardown removes the host-side veth for containerID together with the
// host routes for ip6 (/128) and ip4 (/32). Deleting the host end of a veth
// pair also removes its peer.
//
// Either address may be nil, in which case its route is not touched. A
// host link that does not exist (never created, or already removed) is not
// an error, so Teardown can safely be repeated.
//
// It returns an error only if the link lookup fails for a reason other than
// "not found", or if the link still exists after a failed delete.
func Teardown(containerID string, ip6, ip4 net.IP) error {
	hostName := HostIfaceName(containerID)
	host, err := netlink.LinkByName(hostName)
	if err != nil {
		if linkNotFound(err) {
			return nil
		}
		return fmt.Errorf("lookup %s: %w", hostName, err)
	}

	// Routes disappear with the link, but delete them explicitly so stale
	// routes cannot outlive the veth if state is corrupt. Errors are ignored
	// on purpose: a route that is already gone is the desired outcome.
	index := host.Attrs().Index
	if ip6 != nil {
		_ = netlink.RouteDel(&netlink.Route{LinkIndex: index, Dst: cidrFor(ip6, 128)})
	}
	if ip4 != nil {
		_ = netlink.RouteDel(&netlink.Route{LinkIndex: index, Dst: cidrFor(ip4, 32)})
	}

	if err := netlink.LinkDel(host); err != nil && !errors.Is(err, os.ErrNotExist) {
		// os.ErrNotExist only matches ENOENT, while a device that vanished
		// between the lookup and the delete is reported with a different
		// errno. Re-check by name so a concurrent removal still counts as
		// success.
		if _, lookupErr := netlink.LinkByName(hostName); linkNotFound(lookupErr) {
			return nil
		}
		return fmt.Errorf("delete %s: %w", hostName, err)
	}
	return nil
}
// ensureVeth returns the host-side link named hostName and, when the pair
// was created by this call, the peer link named peerName.
//
// If a link called hostName already exists it is reused and the returned
// peer is nil; callers treat a nil peer as "already moved into the pod
// netns". The peer is deliberately not looked up by name in that case,
// because peerName (typically "eth0") can match an unrelated interface in
// the current namespace.
//
// Otherwise a new veth pair is created with MTU 1500 and both ends are
// looked up by name in the current namespace.
//
// NOTE(review): the pair is created in the caller's current netns, so
// creation presumably fails if a link named peerName already exists there —
// confirm against the environments this runs in.
func ensureVeth(hostName, peerName string) (netlink.Link, netlink.Link, error) {
	if existing, err := netlink.LinkByName(hostName); err == nil {
		return existing, nil, nil
	}

	veth := &netlink.Veth{
		LinkAttrs: netlink.LinkAttrs{Name: hostName, MTU: 1500},
		PeerName:  peerName,
	}
	if err := netlink.LinkAdd(veth); err != nil {
		return nil, nil, fmt.Errorf("link add: %w", err)
	}

	host, err := netlink.LinkByName(hostName)
	if err != nil {
		return nil, nil, fmt.Errorf("lookup host after add: %w", err)
	}
	peer, err := netlink.LinkByName(peerName)
	if err != nil {
		// Do not leave a pair behind whose peer we cannot hand out: a retry
		// would find the host link and wrongly assume the peer was already
		// moved. Best-effort; the lookup error is what gets reported.
		_ = netlink.LinkDel(host)
		return nil, nil, fmt.Errorf("lookup peer after add: %w", err)
	}
	return host, peer, nil
}
// configureHostSide prepares the host end of the veth pair:
//
//   - writes addr_gen_mode=1 for the interface, to suppress the
//     kernel-generated IPv6 link-local address (best-effort);
//   - brings the link up;
//   - assigns fe80::1/64 (linkLocalGW), tolerating "already exists";
//   - enables IPv4 proxy_arp and IPv4/IPv6 forwarding on the interface.
//
// It returns an error if the link cannot be brought up, the address cannot
// be added, or one of the proxy_arp/forwarding sysctl writes fails.
func configureHostSide(host netlink.Link) error {
	name := host.Attrs().Name

	// Must be written before the link comes up so the kernel does not
	// generate its own link-local address. The result is ignored on
	// purpose: if the write fails, fe80::1 is still added explicitly below.
	_ = sysctlWrite("/proc/sys/net/ipv6/conf/"+name+"/addr_gen_mode", "1")

	if err := netlink.LinkSetUp(host); err != nil {
		return fmt.Errorf("set up: %w", err)
	}

	addr := &netlink.Addr{IPNet: &net.IPNet{IP: linkLocalGW, Mask: net.CIDRMask(64, 128)}}
	if err := netlink.AddrAdd(host, addr); err != nil && !errors.Is(err, os.ErrExist) {
		return fmt.Errorf("addr add fe80::1: %w", err)
	}

	for _, kv := range []struct{ k, v string }{
		{"/proc/sys/net/ipv4/conf/" + name + "/proxy_arp", "1"},
		{"/proc/sys/net/ipv4/conf/" + name + "/forwarding", "1"},
		{"/proc/sys/net/ipv6/conf/" + name + "/forwarding", "1"},
	} {
		if err := sysctlWrite(kv.k, kv.v); err != nil {
			return err
		}
	}
	return nil
}
// configurePodSide enters the pod network namespace at req.Netns and
// configures the interface named req.IfName there:
//
//   - writes addr_gen_mode=1 for the interface (best-effort) and brings the
//     link up;
//   - if req.IP6 is set: adds it as a /128 and installs an IPv6 default
//     route via linkLocalGW;
//   - if req.IP4 is set: adds it as a /32, installs a link-scope /32 route
//     to v4ProxyGW, then an IPv4 default route via v4ProxyGW.
//
// Addresses and routes that already exist are accepted, so the function
// can be re-run. Errors opening the namespace are returned unwrapped; all
// other failures are wrapped with a prefix naming the failing step.
func configurePodSide(req SetupRequest) error {
	podNS, err := ns.GetNS(req.Netns)
	if err != nil {
		return err
	}
	defer podNS.Close()

	// failed reports whether err is a real failure, i.e. anything other
	// than success or "already exists".
	failed := func(err error) bool {
		return err != nil && !errors.Is(err, os.ErrExist)
	}

	return podNS.Do(func(ns.NetNS) error {
		runtime.LockOSThread()
		defer runtime.UnlockOSThread()

		link, err := netlink.LinkByName(req.IfName)
		if err != nil {
			return fmt.Errorf("lookup pod %s: %w", req.IfName, err)
		}

		// Best-effort: result deliberately ignored.
		_ = sysctlWrite("/proc/sys/net/ipv6/conf/"+req.IfName+"/addr_gen_mode", "1")

		if err := netlink.LinkSetUp(link); err != nil {
			return fmt.Errorf("set up pod %s: %w", req.IfName, err)
		}
		index := link.Attrs().Index

		if req.IP6 != nil {
			podAddr := &netlink.Addr{
				IPNet: &net.IPNet{IP: req.IP6, Mask: net.CIDRMask(128, 128)},
			}
			if err := netlink.AddrAdd(link, podAddr); failed(err) {
				return fmt.Errorf("pod ip6 add: %w", err)
			}

			// The link-local gateway is reachable directly on the link, so
			// no separate on-link route is needed for v6.
			defaultV6 := &netlink.Route{
				LinkIndex: index,
				Dst:       &net.IPNet{IP: net.IPv6zero, Mask: net.CIDRMask(0, 128)},
				Gw:        linkLocalGW,
			}
			if err := netlink.RouteAdd(defaultV6); failed(err) {
				return fmt.Errorf("pod default v6 route: %w", err)
			}
		}

		if req.IP4 != nil {
			podAddr := &netlink.Addr{
				IPNet: &net.IPNet{IP: req.IP4, Mask: net.CIDRMask(32, 32)},
			}
			if err := netlink.AddrAdd(link, podAddr); failed(err) {
				return fmt.Errorf("pod ip4 add: %w", err)
			}

			// Order matters: the gateway must be reachable on-link before a
			// default route through it can be installed.
			onLink := &netlink.Route{
				LinkIndex: index,
				Scope:     netlink.SCOPE_LINK,
				Dst:       &net.IPNet{IP: v4ProxyGW, Mask: net.CIDRMask(32, 32)},
			}
			if err := netlink.RouteAdd(onLink); failed(err) {
				return fmt.Errorf("pod onlink v4 route: %w", err)
			}

			defaultV4 := &netlink.Route{
				LinkIndex: index,
				Dst:       &net.IPNet{IP: net.IPv4zero, Mask: net.CIDRMask(0, 32)},
				Gw:        v4ProxyGW,
			}
			if err := netlink.RouteAdd(defaultV4); failed(err) {
				return fmt.Errorf("pod default v4 route: %w", err)
			}
		}
		return nil
	})
}
// setHostRoute installs (or replaces) a link-scope route for ip/prefix that
// points at the link with index linkIndex. The destination network is built
// with cidrFor. Any error from the netlink call is returned unchanged.
func setHostRoute(linkIndex int, ip net.IP, prefix int) error {
	return netlink.RouteReplace(&netlink.Route{
		LinkIndex: linkIndex,
		Scope:     netlink.SCOPE_LINK,
		Dst:       cidrFor(ip, prefix),
	})
}
// cidrFor builds the network ip/prefix. When ip has a 4-byte form (as
// reported by To4) the result uses that form with a 32-bit mask; otherwise
// it uses the 16-byte form with a 128-bit mask.
func cidrFor(ip net.IP, prefix int) *net.IPNet {
	addr, bits := ip.To16(), 128
	if v4 := ip.To4(); v4 != nil {
		addr, bits = v4, 32
	}
	return &net.IPNet{IP: addr, Mask: net.CIDRMask(prefix, bits)}
}
// sysctlWrite writes value to the file at path. A path that does not exist
// is treated as success, because some sysctls are absent for newly-created
// interfaces until ipv6 is loaded. Any other write failure is returned,
// wrapped with the path and value.
func sysctlWrite(path, value string) error {
	err := os.WriteFile(path, []byte(value), 0o644)
	switch {
	case err == nil:
		return nil
	case errors.Is(err, os.ErrNotExist):
		// Best-effort: the knob is simply not there.
		return nil
	default:
		return fmt.Errorf("sysctl %s=%s: %w", path, value, err)
	}
}
// linkNotFound reports whether err is, or wraps, a
// netlink.LinkNotFoundError. A nil err yields false.
func linkNotFound(err error) bool {
	var notFound netlink.LinkNotFoundError
	return err != nil && errors.As(err, &notFound)
}