netpol: NetworkPolicy v1 enforcement via nftables

New pkg/agent/netpol implementing standard networking.k8s.io/v1 NetworkPolicy. Pipeline: pods + policies + namespaces → Translate → Render → Apply Supports ingress + egress, all three peer types (podSelector, namespaceSelector, ipBlock with except), numeric ports + port ranges, default-deny semantics derived from PolicyTypes (or inferred from non-empty Spec.Egress when unset). Apply path is `nft -f -` shell-out — single transaction, atomic, kernel guarantees partial-failure rollback. Idempotent dedup via last-applied script. Reconcile triggers: informer events, 30s self-heal tick, every CNI ADD/DEL. Verified against the three live cluster NetPols (calico-apiserver, remote-proxies/lodge-home-assistant, storage/garage-admin-restrict). Fuzz target stitches Translate + Render with random selector and peer inputs; 21 unit tests cover the policy semantics. Named ports skip with a warn — deferred until kubelet exposes them in a form that doesn't require shadowing pod state. Dockerfile: + nftables. Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 09:25:58 -05:00
parent 71e584cf96
commit 39ede9130b
16 changed files with 2698 additions and 2 deletions
@@ -0,0 +1,322 @@
+package netpol
+
+import (
+	"fmt"
+	"hash/fnv"
+	"net"
+	"sort"
+	"strings"
+)
+
+// Render produces an nftables script that, when applied with `nft -f -`,
+// installs the desired NetworkPolicy enforcement state for this node.
+//
+// Layout:
+//
+//	table inet flock_netpol {
+//	  chain forward {                      # base chain on hook forward
+//	    type filter hook forward priority filter; policy accept;
+//	    # one jump per (pod, direction) that has rules and/or isolation
+//	    iifname "flock1a2b3c4d" ip6 saddr 2001:db8::1 jump pod_<hash>_egress
+//	    oifname "flock1a2b3c4d" ip6 daddr 2001:db8::1 jump pod_<hash>_ingress
+//	  }
+//	  chain pod_<hash>_ingress {           # one per isolated direction
+//	    # explicit allow lines (empty for default-deny)
+//	    drop
+//	  }
+//	  chain pod_<hash>_egress { ... }
+//	}
+//
+// The whole table is replaced atomically: a "delete table … 2>/dev/null"
+// (best-effort) followed by an "add table" + the chains. nft executes the
+// script as a single transaction; partial application is impossible.
+//
+// Output is deterministic: equal Output → byte-identical script. The
+// reconciler relies on this for de-dup.
+func Render(out Output) string {
+	var sb strings.Builder
+
+	sb.WriteString("# Generated by flock-agent netpol; do not edit by hand.\n")
+	// Best-effort delete; if the table doesn't exist (first run) nft
+	// returns an error, hence the redirect. The "add table" then
+	// recreates everything.
+	sb.WriteString("destroy table inet flock_netpol\n")
+	sb.WriteString("table inet flock_netpol {\n")
+
+	// Build per-(pod, direction) chains. We need them defined BEFORE the
+	// base chain references them, so we render chains first.
+	chains := buildChains(out)
+	for _, c := range chains {
+		writeChain(&sb, c)
+	}
+
+	// Base chain emits jumps in a stable order (chain name asc).
+	sb.WriteString("\tchain forward {\n")
+	sb.WriteString("\t\ttype filter hook forward priority filter; policy accept;\n")
+	for _, c := range chains {
+		writeBaseJump(&sb, c)
+	}
+	sb.WriteString("\t}\n")
+
+	sb.WriteString("}\n")
+	return sb.String()
+}
+
+// chain is one rendered chain — one direction of one pod.
+type chain struct {
+	name      string // pod_<hash>_ingress / _egress
+	hostIface string
+	podIPs    []net.IP
+	direction Direction
+	rules     []Rule
+	policy    string // "drop" or "accept"
+}
+
+// buildChains groups rules by (PodKey, Direction) and adds default-deny
+// chains for isolated directions that received no explicit rules.
+func buildChains(out Output) []chain {
+	type key struct {
+		podKey string
+		dir    Direction
+	}
+	byKey := map[key]*chain{}
+
+	// Seed isolated directions with empty chains so default-deny lands
+	// even when no explicit allow rule was emitted for them.
+	for iso := range out.Isolated {
+		byKey[key{podKey: iso.PodKey, dir: iso.Direction}] = &chain{
+			direction: iso.Direction,
+			policy:    "drop",
+		}
+	}
+
+	// Append rules into their chain. Rule.PodIPs and HostIface are
+	// authoritative — every rule for a given pod carries the same values
+	// (translator invariant), so we copy from the first.
+	for _, r := range out.Rules {
+		k := key{podKey: r.PodKey, dir: r.Direction}
+		c := byKey[k]
+		if c == nil {
+			// Rule for a non-isolated direction shouldn't happen in
+			// practice (translator only emits rules for selected pods)
+			// but be tolerant — the chain just gets policy accept.
+			c = &chain{direction: r.Direction, policy: "accept"}
+			byKey[k] = c
+		}
+		c.rules = append(c.rules, r)
+		if c.hostIface == "" {
+			c.hostIface = r.HostIface
+			c.podIPs = append([]net.IP(nil), r.PodIPs...)
+		}
+	}
+
+	// If a chain was created from Isolated only (no rules), look up the
+	// pod's HostIface + IPs from Output.Pods. This is the path a
+	// default-deny policy takes — no allow rules, only isolation.
+	for k, c := range byKey {
+		if c.hostIface != "" {
+			continue
+		}
+		if lp, ok := out.Pods[k.podKey]; ok {
+			c.hostIface = lp.HostIface
+			c.podIPs = append([]net.IP(nil), lp.IPs...)
+			continue
+		}
+		// Last resort: lift from any rule sharing the PodKey. Should
+		// not normally happen — the translator populates Pods for every
+		// isolated pod — but defends against partially-populated Output
+		// values constructed by tests.
+		for _, r := range out.Rules {
+			if r.PodKey == k.podKey {
+				c.hostIface = r.HostIface
+				c.podIPs = append([]net.IP(nil), r.PodIPs...)
+				break
+			}
+		}
+	}
+
+	// Materialise chain names and emit in deterministic order.
+	var chains []chain
+	for k, c := range byKey {
+		if c.hostIface == "" {
+			continue // can't jump to it; skip
+		}
+		c.name = chainName(k.podKey, c.direction)
+		chains = append(chains, *c)
+	}
+	sort.Slice(chains, func(i, j int) bool { return chains[i].name < chains[j].name })
+	return chains
+}
+
+// chainName produces a stable, name-safe chain identifier. Pod keys can
+// contain characters nft doesn't allow in identifiers, so we hash them.
+// Direction keeps ingress and egress separate.
+func chainName(podKey string, dir Direction) string {
+	h := fnv.New64a()
+	_, _ = h.Write([]byte(podKey))
+	return fmt.Sprintf("pod_%016x_%s", h.Sum64(), dir)
+}
+
+// writeChain emits the chain definition. Empty chains exist deliberately:
+// the chain's drop policy IS the default-deny.
+func writeChain(sb *strings.Builder, c chain) {
+	fmt.Fprintf(sb, "\tchain %s {\n", c.name)
+	for _, r := range c.rules {
+		writeAllowRule(sb, r)
+	}
+	if c.policy == "drop" {
+		sb.WriteString("\t\tdrop\n")
+	}
+	sb.WriteString("\t}\n")
+}
+
+// writeAllowRule emits one accept line:
+//
+//	[ip|ip6 saddr {peers}] [ip|ip6 saddr != {except}] [proto dport {port|port-end}] accept
+//
+// The saddr / daddr field flips based on direction (ingress = from peer →
+// match saddr; egress = to peer → match daddr).
+func writeAllowRule(sb *strings.Builder, r Rule) {
+	v6Peers, v4Peers := splitFamily(r.PeerCIDRs)
+	v6Except, v4Except := splitFamily(r.PeerExcept)
+	v6Pod, v4Pod := splitIPFamily(r.PodIPs)
+	hasPeerFilter := len(r.PeerCIDRs) > 0
+
+	emit := func(family string, peers, except []*net.IPNet, podIP net.IP) {
+		if hasPeerFilter && len(peers) == 0 && len(except) == 0 {
+			// Peer filter exists but no entries of this family — rule
+			// must not match anything for this family.
+			return
+		}
+		if podIP == nil {
+			// Pod has no address of this family; nothing to guard.
+			return
+		}
+		for _, port := range r.Ports {
+			sb.WriteString("\t\t")
+			// Peer (saddr/daddr) match: address is "peer's address",
+			// which is saddr on ingress and daddr on egress.
+			peerField := peerAddrField(family, r.Direction)
+			if hasPeerFilter && len(peers) > 0 {
+				fmt.Fprintf(sb, "%s { %s } ", peerField, joinCIDRs(peers))
+			}
+			if hasPeerFilter && len(except) > 0 {
+				fmt.Fprintf(sb, "%s != { %s } ", peerField, joinCIDRs(except))
+			}
+			// Port match.
+			writePortMatch(sb, port)
+			fmt.Fprintf(sb, "%s\n", r.Action)
+		}
+	}
+	emit("ip6", v6Peers, v6Except, v6Pod)
+	emit("ip", v4Peers, v4Except, v4Pod)
+}
+
+// peerAddrField returns "ip6 saddr" / "ip saddr" / "ip6 daddr" / "ip daddr"
+// depending on family + direction. Ingress matches the peer as the source;
+// egress matches the peer as the destination.
+func peerAddrField(family string, dir Direction) string {
+	switch {
+	case dir == DirIngress:
+		return family + " saddr"
+	default:
+		return family + " daddr"
+	}
+}
+
+// writePortMatch appends "tcp dport 80 " (single port) or
+// "tcp dport 8000-8999 " (range), or nothing when port is "any".
+func writePortMatch(sb *strings.Builder, p PortMatch) {
+	if p.Port == 0 && p.Protocol == "" {
+		return
+	}
+	proto := p.Protocol
+	if proto == "" {
+		proto = "tcp"
+	}
+	if p.Port == 0 {
+		// Protocol-only match. nft has `meta l4proto tcp`.
+		fmt.Fprintf(sb, "meta l4proto %s ", proto)
+		return
+	}
+	if p.EndPort > p.Port {
+		fmt.Fprintf(sb, "%s dport %d-%d ", proto, p.Port, p.EndPort)
+		return
+	}
+	fmt.Fprintf(sb, "%s dport %d ", proto, p.Port)
+}
+
+// writeBaseJump emits one line per (pod, direction) chain in the base
+// `forward` chain. The match is anchored on the host-side veth name so
+// the rule only fires for traffic that genuinely crosses this pod's veth.
+//
+// We additionally constrain on the pod's address (saddr for egress, daddr
+// for ingress) so a packet that somehow hits the wrong veth — e.g. during
+// a CNI ADD race — won't be policy-evaluated against the wrong pod.
+func writeBaseJump(sb *strings.Builder, c chain) {
+	v6, v4 := splitIPFamily(c.podIPs)
+	emit := func(family string, ip net.IP) {
+		if ip == nil {
+			return
+		}
+		var iface, addrField, addrStr string
+		if c.direction == DirEgress {
+			iface = "iifname"
+			addrField = family + " saddr"
+		} else {
+			iface = "oifname"
+			addrField = family + " daddr"
+		}
+		if family == "ip" {
+			addrStr = ip.To4().String()
+		} else {
+			addrStr = ip.To16().String()
+		}
+		fmt.Fprintf(sb, "\t\t%s \"%s\" %s %s jump %s\n", iface, c.hostIface, addrField, addrStr, c.name)
+	}
+	emit("ip6", v6)
+	emit("ip", v4)
+}
+
+// splitFamily partitions CIDRs into (v6, v4) lists, preserving order
+// within each family.
+func splitFamily(cs []*net.IPNet) ([]*net.IPNet, []*net.IPNet) {
+	var v6, v4 []*net.IPNet
+	for _, c := range cs {
+		if c.IP.To4() != nil {
+			v4 = append(v4, c)
+		} else {
+			v6 = append(v6, c)
+		}
+	}
+	return v6, v4
+}
+
+// splitIPFamily picks one v6 and one v4 from a list of pod IPs (a pod has
+// at most one of each in flock's model).
+func splitIPFamily(ips []net.IP) (v6, v4 net.IP) {
+	for _, ip := range ips {
+		if ip == nil {
+			continue
+		}
+		if ip.To4() != nil {
+			if v4 == nil {
+				v4 = ip
+			}
+		} else {
+			if v6 == nil {
+				v6 = ip
+			}
+		}
+	}
+	return
+}
+
+func joinCIDRs(cs []*net.IPNet) string {
+	parts := make([]string, len(cs))
+	for i, c := range cs {
+		parts[i] = c.String()
+	}
+	sort.Strings(parts)
+	return strings.Join(parts, ", ")
+}