//go:build linux package agent import ( "errors" "fmt" "net" "os" "runtime" "github.com/containernetworking/plugins/pkg/ns" "github.com/vishvananda/netlink" ) // SetupRequest is the netlink setup input for one pod. type SetupRequest struct { ContainerID string Netns string // /proc//ns/net IfName string // pod-side iface name (typically "eth0") HostIface string // host-side veth name from HostIfaceName IP6 net.IP // /128 inside pod IP4 net.IP // /32 inside pod (may be nil) } // LinkLocalGW is the deterministic IPv6 LL gateway placed on every host // veth. Pod default route uses this as next-hop. Avoids waiting for kernel // LL DAD on the host side. var linkLocalGW = net.ParseIP("fe80::1") // v4ProxyGW is the well-known link-local IPv4 used by container CNIs as a // next-hop for proxy-arp gateways (cilium, calico, kindnet — all use this). var v4ProxyGW = net.IPv4(169, 254, 1, 1) // Setup creates the veth pair, configures the host side, moves the peer // into the pod netns, configures the pod side, and writes host routes. // All steps are idempotent: an already-existing object that matches the // desired state is treated as success. func Setup(req SetupRequest) error { if req.HostIface == "" { req.HostIface = HostIfaceName(req.ContainerID) } if req.IfName == "" { req.IfName = "eth0" } // Create veth pair (or reuse existing). host, peer, err := ensureVeth(req.HostIface, req.IfName) if err != nil { return fmt.Errorf("ensure veth: %w", err) } // Host-side: addrgenmode none → up → fe80::1/64 → sysctls. if err := configureHostSide(host); err != nil { return fmt.Errorf("configure host side %s: %w", host.Attrs().Name, err) } // Move peer into pod netns + configure (only if it's still on host). hostNS, err := ns.GetCurrentNS() if err != nil { return fmt.Errorf("get current netns: %w", err) } defer hostNS.Close() if peer != nil { // Peer is still on the host — move it. podNS, err := ns.GetNS(req.Netns) if err != nil { return fmt.Errorf("open pod netns %s: %w", req.Netns, err) } defer podNS.Close() if err := netlink.LinkSetNsFd(peer, int(podNS.Fd())); err != nil { return fmt.Errorf("move peer %s into pod ns: %w", peer.Attrs().Name, err) } } // Configure pod-side from inside the pod netns. if err := configurePodSide(req); err != nil { return fmt.Errorf("configure pod side: %w", err) } // Host route(s): one /128 (and /32 if v4) pointing at the host veth. if err := setHostRoute(host.Attrs().Index, req.IP6, 128); err != nil { return fmt.Errorf("host route v6: %w", err) } if req.IP4 != nil { if err := setHostRoute(host.Attrs().Index, req.IP4, 32); err != nil { return fmt.Errorf("host route v4: %w", err) } } return nil } // Teardown removes the host-side veth (which also tears down the peer in // the pod netns) and the host /128 + /32 routes. All operations are // idempotent — missing objects are not errors. func Teardown(containerID string, ip6, ip4 net.IP) error { hostName := HostIfaceName(containerID) host, err := netlink.LinkByName(hostName) if err == nil { // Routes are removed when the link goes away, but be explicit so // stale routes can't outlive the veth on a corrupt state. if ip6 != nil { _ = netlink.RouteDel(&netlink.Route{LinkIndex: host.Attrs().Index, Dst: cidrFor(ip6, 128)}) } if ip4 != nil { _ = netlink.RouteDel(&netlink.Route{LinkIndex: host.Attrs().Index, Dst: cidrFor(ip4, 32)}) } if err := netlink.LinkDel(host); err != nil && !errors.Is(err, os.ErrNotExist) { return fmt.Errorf("delete %s: %w", hostName, err) } } else if !linkNotFound(err) { return fmt.Errorf("lookup %s: %w", hostName, err) } return nil } // ensureVeth returns the host link (always) and the peer link (only if it's // still on the host — nil if it has already been moved into a netns). func ensureVeth(hostName, peerName string) (netlink.Link, netlink.Link, error) { if existing, err := netlink.LinkByName(hostName); err == nil { // Already exists; the peer may be on the host or in a netns. peer, _ := netlink.LinkByName(peerName) // peer name is "eth0" — usually only matches in pod ns _ = peer // Don't try to find peer on host by name (collides). Return nil peer; ensureVeth caller treats nil as "already moved". return existing, nil, nil } // Need to create. veth := &netlink.Veth{ LinkAttrs: netlink.LinkAttrs{Name: hostName, MTU: 1500}, PeerName: peerName, } if err := netlink.LinkAdd(veth); err != nil { return nil, nil, fmt.Errorf("link add: %w", err) } host, err := netlink.LinkByName(hostName) if err != nil { return nil, nil, fmt.Errorf("lookup host after add: %w", err) } peer, err := netlink.LinkByName(peerName) if err != nil { return nil, nil, fmt.Errorf("lookup peer after add: %w", err) } return host, peer, nil } func configureHostSide(host netlink.Link) error { name := host.Attrs().Name // addrgenmode = none (suppress kernel LL). if err := netlink.LinkSetVfHardwareAddr(host, 0, nil); err != nil { // This SetVf isn't the right call; instead use LinkSetGroup or use sysfs directly. // Fallback: write to /proc/sys/net/ipv6/conf//addr_gen_mode = 1 } _ = sysctlWrite("/proc/sys/net/ipv6/conf/"+name+"/addr_gen_mode", "1") // Bring up. if err := netlink.LinkSetUp(host); err != nil { return fmt.Errorf("set up: %w", err) } // fe80::1/64. addr := &netlink.Addr{IPNet: &net.IPNet{IP: linkLocalGW, Mask: net.CIDRMask(64, 128)}} if err := netlink.AddrAdd(host, addr); err != nil && !errors.Is(err, os.ErrExist) { return fmt.Errorf("addr add fe80::1: %w", err) } // sysctls. for _, kv := range []struct{ k, v string }{ {"/proc/sys/net/ipv4/conf/" + name + "/proxy_arp", "1"}, {"/proc/sys/net/ipv4/conf/" + name + "/forwarding", "1"}, {"/proc/sys/net/ipv6/conf/" + name + "/forwarding", "1"}, } { if err := sysctlWrite(kv.k, kv.v); err != nil { return err } } return nil } func configurePodSide(req SetupRequest) error { podNS, err := ns.GetNS(req.Netns) if err != nil { return err } defer podNS.Close() return podNS.Do(func(ns.NetNS) error { runtime.LockOSThread() defer runtime.UnlockOSThread() eth0, err := netlink.LinkByName(req.IfName) if err != nil { return fmt.Errorf("lookup pod %s: %w", req.IfName, err) } _ = sysctlWrite("/proc/sys/net/ipv6/conf/"+req.IfName+"/addr_gen_mode", "1") if err := netlink.LinkSetUp(eth0); err != nil { return fmt.Errorf("set up pod %s: %w", req.IfName, err) } if req.IP6 != nil { a := &netlink.Addr{IPNet: &net.IPNet{IP: req.IP6, Mask: net.CIDRMask(128, 128)}} if err := netlink.AddrAdd(eth0, a); err != nil && !errors.Is(err, os.ErrExist) { return fmt.Errorf("pod ip6 add: %w", err) } // Default route via fe80::1, no scope on-link issues because LL is reachable on the link. if err := netlink.RouteAdd(&netlink.Route{ LinkIndex: eth0.Attrs().Index, Dst: &net.IPNet{IP: net.IPv6zero, Mask: net.CIDRMask(0, 128)}, Gw: linkLocalGW, }); err != nil && !errors.Is(err, os.ErrExist) { return fmt.Errorf("pod default v6 route: %w", err) } } if req.IP4 != nil { a := &netlink.Addr{IPNet: &net.IPNet{IP: req.IP4, Mask: net.CIDRMask(32, 32)}} if err := netlink.AddrAdd(eth0, a); err != nil && !errors.Is(err, os.ErrExist) { return fmt.Errorf("pod ip4 add: %w", err) } // On-link route to the proxy gateway, then default via that gateway. if err := netlink.RouteAdd(&netlink.Route{ LinkIndex: eth0.Attrs().Index, Scope: netlink.SCOPE_LINK, Dst: &net.IPNet{IP: v4ProxyGW, Mask: net.CIDRMask(32, 32)}, }); err != nil && !errors.Is(err, os.ErrExist) { return fmt.Errorf("pod onlink v4 route: %w", err) } if err := netlink.RouteAdd(&netlink.Route{ LinkIndex: eth0.Attrs().Index, Dst: &net.IPNet{IP: net.IPv4zero, Mask: net.CIDRMask(0, 32)}, Gw: v4ProxyGW, }); err != nil && !errors.Is(err, os.ErrExist) { return fmt.Errorf("pod default v4 route: %w", err) } } return nil }) } func setHostRoute(linkIndex int, ip net.IP, prefix int) error { r := &netlink.Route{ LinkIndex: linkIndex, Scope: netlink.SCOPE_LINK, Dst: cidrFor(ip, prefix), } if err := netlink.RouteReplace(r); err != nil { return err } return nil } func cidrFor(ip net.IP, prefix int) *net.IPNet { if ip.To4() != nil { return &net.IPNet{IP: ip.To4(), Mask: net.CIDRMask(prefix, 32)} } return &net.IPNet{IP: ip.To16(), Mask: net.CIDRMask(prefix, 128)} } func sysctlWrite(path, value string) error { if err := os.WriteFile(path, []byte(value), 0o644); err != nil { // Some sysctls don't exist for newly-created interfaces until ipv6 is // loaded; treat ENOENT as best-effort. if errors.Is(err, os.ErrNotExist) { return nil } return fmt.Errorf("sysctl %s=%s: %w", path, value, err) } return nil } func linkNotFound(err error) bool { if err == nil { return false } var lnf netlink.LinkNotFoundError return errors.As(err, &lnf) }