Files
flock/pkg/agent/netns_linux.go
T
Donavan Fritz 2daa2a21f3
Build flock Image / build (push) Successful in 3m23s
agent: add flock.fritzlab.net/addresses annotation (eth0 static IPs)
Like anycast, addresses IPs are advertised via BGP (/128+/32) and get
host routes via the AnycastReconciler. The sole difference: they are
assigned to pod eth0 instead of lo, so workloads that inspect their
primary interface (e.g. Plex remote-access detection) see the public IP
directly.

- annotations.go: annAddresses const, Addresses []net.IP in ParsedAnnotations
- state.go: Addresses []string persisted in allocations.json
- anycast.go: resolveAnycastTargets processes Anycast+Addresses together
- netns_linux.go: configurePodSide assigns Addresses to eth0
- netns_stub.go: mirror Addresses field for non-Linux builds
- handlers.go: thread Addresses through ADD path

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
2026-04-28 17:50:49 -05:00

336 lines
11 KiB
Go

//go:build linux
package agent
import (
	"errors"
	"fmt"
	"net"
	"os"
	"runtime"
	"syscall"

	"github.com/containernetworking/plugins/pkg/ns"
	"github.com/vishvananda/netlink"
)
// SetupRequest carries everything Setup needs to wire up networking for a
// single pod.
type SetupRequest struct {
	// ContainerID identifies the pod sandbox. Setup derives the host-side
	// veth name from it (via HostIfaceName) when HostIface is empty.
	ContainerID string
	// Netns is the path to the pod network namespace (/proc/<pid>/ns/net).
	Netns string
	// IfName is the pod-side interface name. Setup defaults it to "eth0".
	IfName string
	// HostIface is the host-side veth name, normally the result of
	// HostIfaceName.
	HostIface string
	// IP6 is the pod IPv6 address, assigned as a /128 inside the pod.
	IP6 net.IP
	// IP4 is the pod IPv4 address, assigned as a /32 inside the pod. It may
	// be nil.
	IP4 net.IP

	// Anycast lists IPs (IPv6 and IPv4 mixed) that are added to the pod's lo,
	// NOT to eth0. The matching host /128 and /32 routes are NOT installed
	// during setup — that happens once the pod becomes Ready, see
	// AnycastReconciler.
	Anycast []net.IP

	// Addresses lists additional IPs that are bound directly on the pod's
	// eth0, NOT on lo. BGP advertisement is handled identically to Anycast by
	// the AnycastReconciler. Use this when the workload needs the IP on its
	// primary interface (e.g. Plex remote-access detection).
	Addresses []net.IP
}
// Fixed next-hop addresses used for pod default routes.
var (
	// linkLocalGW is the deterministic IPv6 link-local gateway address placed
	// on every host-side veth. The pod's IPv6 default route uses it as
	// next-hop, which avoids waiting for kernel LL DAD on the host side.
	linkLocalGW = net.ParseIP("fe80::1")

	// v4ProxyGW is the link-local IPv4 address the pod uses as next-hop for
	// its IPv4 default route; the host answers for it via proxy ARP on the
	// veth. 169.254.1.1 is a convention shared with other container CNIs.
	v4ProxyGW = net.IPv4(169, 254, 1, 1)
)
// Setup wires up networking for one pod. In order, it:
//
//  1. creates the veth pair (or reuses an existing host end),
//  2. configures the host side (up, fe80::1/64, sysctls),
//  3. moves the peer into the pod netns if it is still on the host,
//  4. configures the pod side (addresses and default routes),
//  5. installs the host /128 route (and the /32 route when IP4 is set).
//
// Every step tolerates already-existing state, so Setup can be retried.
// If a freshly created veth pair cannot be brought as far as step 3, the
// pair is deleted again so that a retry starts from a clean slate.
//
// Empty req.HostIface and req.IfName are defaulted to
// HostIfaceName(req.ContainerID) and "eth0". req.IP6 is required.
func Setup(req SetupRequest) error {
	// The IPv6 host route below is installed unconditionally and cannot be
	// built from a nil address; fail before creating any kernel state.
	if req.IP6 == nil {
		return errors.New("setup: IP6 is required")
	}
	if req.HostIface == "" {
		req.HostIface = HostIfaceName(req.ContainerID)
	}
	if req.IfName == "" {
		req.IfName = "eth0"
	}

	// Create veth pair (or reuse existing).
	host, peer, err := ensureVeth(req.HostIface, req.IfName)
	if err != nil {
		return fmt.Errorf("ensure veth: %w", err)
	}

	// A non-nil peer means the pair was created by this call and both ends
	// still live in the host namespace. If we bail out before the peer has
	// been moved, delete the pair: otherwise the next attempt would find the
	// host end, assume the peer was already moved, and never recover.
	peerOnHost := peer != nil
	defer func() {
		if peerOnHost {
			// Best-effort cleanup; the original error is what matters.
			_ = netlink.LinkDel(host)
		}
	}()

	// Host-side: addrgenmode none → up → fe80::1/64 → sysctls.
	if err := configureHostSide(host); err != nil {
		return fmt.Errorf("configure host side %s: %w", host.Attrs().Name, err)
	}

	if peer != nil {
		// Peer is still on the host — move it.
		podNS, err := ns.GetNS(req.Netns)
		if err != nil {
			return fmt.Errorf("open pod netns %s: %w", req.Netns, err)
		}
		defer podNS.Close()
		if err := netlink.LinkSetNsFd(peer, int(podNS.Fd())); err != nil {
			return fmt.Errorf("move peer %s into pod ns: %w", peer.Attrs().Name, err)
		}
		peerOnHost = false
	}

	// Configure pod-side from inside the pod netns.
	if err := configurePodSide(req); err != nil {
		return fmt.Errorf("configure pod side: %w", err)
	}

	// Host route(s): one /128 (and /32 if v4) pointing at the host veth.
	hostIndex := host.Attrs().Index
	if err := setHostRoute(hostIndex, req.IP6, 128); err != nil {
		return fmt.Errorf("host route v6: %w", err)
	}
	if req.IP4 != nil {
		if err := setHostRoute(hostIndex, req.IP4, 32); err != nil {
			return fmt.Errorf("host route v4: %w", err)
		}
	}
	return nil
}
// Teardown removes the host-side veth for containerID (which also tears
// down the peer in the pod netns) and the host /128 + /32 routes for ip6 and
// ip4; either address may be nil to skip its route.
//
// All operations are idempotent: a veth that is already gone — whether at
// lookup time or by the time the delete is issued — is not an error. Any
// other lookup or delete failure is returned wrapped with the veth name.
func Teardown(containerID string, ip6, ip4 net.IP) error {
	hostName := HostIfaceName(containerID)
	host, err := netlink.LinkByName(hostName)
	if err != nil {
		if linkNotFound(err) {
			// Nothing left to tear down.
			return nil
		}
		return fmt.Errorf("lookup %s: %w", hostName, err)
	}

	// Routes are removed when the link goes away, but be explicit so stale
	// routes can't outlive the veth on a corrupt state. The scope matches
	// what setHostRoute installs. Errors are deliberately ignored: a route
	// that is already missing is the desired end state.
	index := host.Attrs().Index
	if ip6 != nil {
		_ = netlink.RouteDel(&netlink.Route{
			LinkIndex: index,
			Scope:     netlink.SCOPE_LINK,
			Dst:       cidrFor(ip6, 128),
		})
	}
	if ip4 != nil {
		_ = netlink.RouteDel(&netlink.Route{
			LinkIndex: index,
			Scope:     netlink.SCOPE_LINK,
			Dst:       cidrFor(ip4, 32),
		})
	}

	// The kernel reports a vanished interface index as ENODEV, which does not
	// map to os.ErrNotExist, so check for it explicitly to stay idempotent
	// when the link disappears between the lookup and the delete.
	err = netlink.LinkDel(host)
	if err != nil && !errors.Is(err, os.ErrNotExist) && !errors.Is(err, syscall.ENODEV) {
		return fmt.Errorf("delete %s: %w", hostName, err)
	}
	return nil
}
// ensureVeth returns the host link (always) and the peer link (only when the
// pair was created by this call and the peer is therefore still on the host).
//
// When a link named hostName already exists it is reused and the peer is
// returned as nil, which callers treat as "already moved into a netns". The
// peer is deliberately not looked up by name in that case: peerName is
// typically "eth0" and could match an unrelated host interface.
//
// Otherwise a new veth pair is created with MTU 1500. If the peer cannot be
// looked up afterwards the pair is deleted again, so a later call does not
// mistake the leftover host end for a fully set-up veth.
//
// NOTE(review): the peer is created in the host namespace under peerName, so
// creation presumably fails while the host itself has an interface with that
// name — confirm against the deployment environment.
func ensureVeth(hostName, peerName string) (netlink.Link, netlink.Link, error) {
	if existing, err := netlink.LinkByName(hostName); err == nil {
		return existing, nil, nil
	}

	// Need to create.
	veth := &netlink.Veth{
		LinkAttrs: netlink.LinkAttrs{Name: hostName, MTU: 1500},
		PeerName:  peerName,
	}
	if err := netlink.LinkAdd(veth); err != nil {
		return nil, nil, fmt.Errorf("link add: %w", err)
	}

	host, err := netlink.LinkByName(hostName)
	if err != nil {
		return nil, nil, fmt.Errorf("lookup host after add: %w", err)
	}
	peer, err := netlink.LinkByName(peerName)
	if err != nil {
		// Best-effort rollback; deleting the host end removes the peer too.
		_ = netlink.LinkDel(host)
		return nil, nil, fmt.Errorf("lookup peer after add: %w", err)
	}
	return host, peer, nil
}
// configureHostSide prepares the host end of the veth pair: it disables
// kernel link-local address generation, brings the link up, assigns the
// fixed gateway address fe80::1/64, and enables proxy ARP plus IPv4/IPv6
// forwarding on the interface.
//
// An address that is already present is not an error. Failures to bring the
// link up, add the address, or write a sysctl are returned.
func configureHostSide(host netlink.Link) error {
	name := host.Attrs().Name

	// addr_gen_mode=1 tells the kernel not to generate a link-local address,
	// so the only LL address on the veth is the fixed gateway added below.
	// Best-effort: a failure here only means an extra kernel-generated
	// address, which does not prevent the gateway address from working.
	_ = sysctlWrite("/proc/sys/net/ipv6/conf/"+name+"/addr_gen_mode", "1")

	// Bring up.
	if err := netlink.LinkSetUp(host); err != nil {
		return fmt.Errorf("set up: %w", err)
	}

	// fe80::1/64.
	gw := &netlink.Addr{IPNet: &net.IPNet{IP: linkLocalGW, Mask: net.CIDRMask(64, 128)}}
	if err := netlink.AddrAdd(host, gw); err != nil && !errors.Is(err, os.ErrExist) {
		return fmt.Errorf("addr add fe80::1: %w", err)
	}

	// sysctls.
	for _, kv := range []struct{ k, v string }{
		{"/proc/sys/net/ipv4/conf/" + name + "/proxy_arp", "1"},
		{"/proc/sys/net/ipv4/conf/" + name + "/forwarding", "1"},
		{"/proc/sys/net/ipv6/conf/" + name + "/forwarding", "1"},
	} {
		if err := sysctlWrite(kv.k, kv.v); err != nil {
			return err
		}
	}
	return nil
}
// configurePodSide opens the pod network namespace named by req.Netns and,
// from inside it, configures the pod interface req.IfName:
//
//   - disables kernel link-local generation (best-effort) and brings it up,
//   - assigns req.IP6 as a /128 plus an IPv6 default route via fe80::1,
//   - assigns req.IP4 as a /32 plus an on-link route to the proxy gateway
//     and an IPv4 default route via it,
//   - assigns every req.Anycast IP to lo (bringing lo up first),
//   - assigns every req.Addresses IP to the pod interface.
//
// Addresses and routes that already exist are treated as success. The first
// other failure is returned with a short description of the failing step.
func configurePodSide(req SetupRequest) error {
	podNS, err := ns.GetNS(req.Netns)
	if err != nil {
		return err
	}
	defer podNS.Close()

	// ignoreExists maps "already exists" to success so re-runs are harmless.
	ignoreExists := func(err error) error {
		if errors.Is(err, os.ErrExist) {
			return nil
		}
		return err
	}

	// assign adds each IP to link as a host address: /32 for IPv4 (in 4-byte
	// form), /128 otherwise. what labels the target in error messages.
	assign := func(link netlink.Link, ips []net.IP, what string) error {
		for _, ip := range ips {
			bits := 128
			if v4 := ip.To4(); v4 != nil {
				ip, bits = v4, 32
			}
			addr := &netlink.Addr{
				IPNet: &net.IPNet{IP: ip, Mask: net.CIDRMask(bits, bits)},
				Scope: int(netlink.SCOPE_UNIVERSE),
			}
			if err := ignoreExists(netlink.AddrAdd(link, addr)); err != nil {
				return fmt.Errorf("pod %s %s: %w", what, ip, err)
			}
		}
		return nil
	}

	return podNS.Do(func(ns.NetNS) error {
		runtime.LockOSThread()
		defer runtime.UnlockOSThread()

		podLink, err := netlink.LinkByName(req.IfName)
		if err != nil {
			return fmt.Errorf("lookup pod %s: %w", req.IfName, err)
		}
		podIndex := podLink.Attrs().Index

		// Best-effort: suppress the kernel-generated link-local address.
		_ = sysctlWrite("/proc/sys/net/ipv6/conf/"+req.IfName+"/addr_gen_mode", "1")
		if err := netlink.LinkSetUp(podLink); err != nil {
			return fmt.Errorf("set up pod %s: %w", req.IfName, err)
		}

		if req.IP6 != nil {
			v6Addr := &netlink.Addr{IPNet: &net.IPNet{IP: req.IP6, Mask: net.CIDRMask(128, 128)}}
			if err := ignoreExists(netlink.AddrAdd(podLink, v6Addr)); err != nil {
				return fmt.Errorf("pod ip6 add: %w", err)
			}
			// Default route via fe80::1; the link-local gateway is reachable
			// on the link, so no separate on-link route is needed.
			v6Default := &netlink.Route{
				LinkIndex: podIndex,
				Dst:       &net.IPNet{IP: net.IPv6zero, Mask: net.CIDRMask(0, 128)},
				Gw:        linkLocalGW,
			}
			if err := ignoreExists(netlink.RouteAdd(v6Default)); err != nil {
				return fmt.Errorf("pod default v6 route: %w", err)
			}
		}

		if req.IP4 != nil {
			v4Addr := &netlink.Addr{IPNet: &net.IPNet{IP: req.IP4, Mask: net.CIDRMask(32, 32)}}
			if err := ignoreExists(netlink.AddrAdd(podLink, v4Addr)); err != nil {
				return fmt.Errorf("pod ip4 add: %w", err)
			}
			// The gateway must first be made reachable with an on-link route;
			// only then can the default route point at it.
			v4OnLink := &netlink.Route{
				LinkIndex: podIndex,
				Scope:     netlink.SCOPE_LINK,
				Dst:       &net.IPNet{IP: v4ProxyGW, Mask: net.CIDRMask(32, 32)},
			}
			if err := ignoreExists(netlink.RouteAdd(v4OnLink)); err != nil {
				return fmt.Errorf("pod onlink v4 route: %w", err)
			}
			v4Default := &netlink.Route{
				LinkIndex: podIndex,
				Dst:       &net.IPNet{IP: net.IPv4zero, Mask: net.CIDRMask(0, 32)},
				Gw:        v4ProxyGW,
			}
			if err := ignoreExists(netlink.RouteAdd(v4Default)); err != nil {
				return fmt.Errorf("pod default v4 route: %w", err)
			}
		}

		// Anycast IPs live on lo, per the design doc. No NDP/ARP happens for
		// the anycast IP itself: the host route is
		// `<anycast> via <pod-eth0-ip> dev flock<8hex>`, so the kernel
		// resolves <pod-eth0-ip>, which IS on eth0 and answers normally.
		if len(req.Anycast) > 0 {
			lo, err := netlink.LinkByName("lo")
			if err != nil {
				return fmt.Errorf("lookup pod lo: %w", err)
			}
			if err := netlink.LinkSetUp(lo); err != nil {
				return fmt.Errorf("set up pod lo: %w", err)
			}
			if err := assign(lo, req.Anycast, "lo anycast"); err != nil {
				return err
			}
		}

		// Addresses go directly on the pod interface. Host routing and BGP
		// advertisement are handled identically to Anycast by the
		// AnycastReconciler (host route via pod-eth0-ip, /128+/32 in BIRD).
		return assign(podLink, req.Addresses, "eth0 address")
	})
}
// setHostRoute installs (or replaces) a link-scoped route for ip/prefix that
// points at the interface with index linkIndex. The destination is built by
// cidrFor, so IPv4 and IPv6 addresses are both accepted. The error from the
// netlink call is returned unchanged.
func setHostRoute(linkIndex int, ip net.IP, prefix int) error {
	return netlink.RouteReplace(&netlink.Route{
		LinkIndex: linkIndex,
		Scope:     netlink.SCOPE_LINK,
		Dst:       cidrFor(ip, prefix),
	})
}
// cidrFor builds the network ip/prefix. An address that has an IPv4 form is
// returned in its 4-byte representation with a 32-bit mask; anything else is
// returned in 16-byte form with a 128-bit mask.
func cidrFor(ip net.IP, prefix int) *net.IPNet {
	addr, bits := ip.To16(), 128
	if v4 := ip.To4(); v4 != nil {
		addr, bits = v4, 32
	}
	return &net.IPNet{IP: addr, Mask: net.CIDRMask(prefix, bits)}
}
// sysctlWrite writes value to the sysctl file at path.
//
// A path that does not exist is treated as best-effort success, because some
// sysctls don't exist for newly-created interfaces until ipv6 is loaded. Any
// other write failure is returned, wrapped with the path and value.
func sysctlWrite(path, value string) error {
	err := os.WriteFile(path, []byte(value), 0o644)
	if err == nil || errors.Is(err, os.ErrNotExist) {
		return nil
	}
	return fmt.Errorf("sysctl %s=%s: %w", path, value, err)
}
// linkNotFound reports whether err is, or wraps, a
// netlink.LinkNotFoundError. A nil err is never "not found".
func linkNotFound(err error) bool {
	var notFound netlink.LinkNotFoundError
	return err != nil && errors.As(err, &notFound)
}