netpol: NetworkPolicy v1 enforcement via nftables

New pkg/agent/netpol implementing standard networking.k8s.io/v1 NetworkPolicy. Pipeline: pods + policies + namespaces → Translate → Render → Apply. Supports ingress + egress, all three peer types (podSelector, namespaceSelector, ipBlock with except), numeric ports + port ranges, and default-deny semantics derived from PolicyTypes (or inferred from non-empty Spec.Egress when unset). The Apply path is an `nft -f -` shell-out — a single atomic transaction; the kernel guarantees rollback on partial failure. Idempotent dedup via the last-applied script. Reconcile triggers: informer events, a 30s self-heal tick, and every CNI ADD/DEL. Verified against the three live cluster NetPols (calico-apiserver, remote-proxies/lodge-home-assistant, storage/garage-admin-restrict). The fuzz target stitches Translate + Render with random selector and peer inputs; 21 unit tests cover the policy semantics. Named ports are skipped with a warning — deferred until kubelet exposes them in a form that doesn't require shadowing pod state. Dockerfile: + nftables. Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 09:25:58 -05:00
parent 71e584cf96
commit 39ede9130b
16 changed files with 2698 additions and 2 deletions
@@ -0,0 +1,443 @@
+package netpol
+
+import (
+	"fmt"
+	"net"
+	"sort"
+
+	netv1 "k8s.io/api/networking/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+)
+
+// Inputs is the world-view the translator consumes. All fields are owned
+// by the caller; the translator does not mutate them (it works on copies
+// of the policy slice and of each pod's IP list).
+type Inputs struct {
+	// LocalPods are the pods scheduled on this node that have a committed
+	// flock allocation. Only these pods get rules — peers may live
+	// elsewhere. A pod with an empty IPs list is skipped entirely by
+	// Translate.
+	LocalPods []Pod
+
+	// PeerPods is the cluster-wide pod set used to resolve podSelector +
+	// namespaceSelector peers. It is fine to include the local pods here
+	// too; duplicates are deduped by (namespace, name).
+	// NOTE(review): the dedup is the intended contract — confirm that the
+	// indexing step (indexPeerPods) actually enforces it.
+	PeerPods []PeerPod
+
+	// Namespaces is the cluster's full Namespace set. Used for
+	// namespaceSelector matching: each namespace's labels are tested
+	// against the peer's namespaceSelector.
+	Namespaces []Namespace
+
+	// Policies is every NetworkPolicy in the cluster. The translator
+	// filters down to those that select at least one local pod; a policy
+	// only ever selects pods in its own namespace.
+	Policies []netv1.NetworkPolicy
+}
+
+// Output is the result of one translation pass.
+type Output struct {
+	// Rules is the flat ordered list of allow rules to render. The
+	// renderer groups them by (PodKey, Direction) into chains. Rules are
+	// appended in LocalPods order, then in sorted policy order, with a
+	// policy's ingress rules before its egress rules.
+	Rules []Rule
+
+	// Isolated is the set of (PodKey, Direction) pairs whose chain must
+	// have a default-deny policy. A pod selected by at least one policy
+	// in a given direction shows up here. The renderer uses this to
+	// decide whether to emit a chain at all and what its base policy is.
+	Isolated map[Isolation]struct{}
+
+	// Pods carries the HostIface + IPs for every local pod referenced
+	// by the policy world, including pods that produced only isolation
+	// (default-deny) without any allow rules. The renderer needs this
+	// because such a pod has no Rule to lift the HostIface from. The IPs
+	// slice is a copy of the input pod's IPs.
+	Pods map[string]LocalPod // key = namespace/name
+}
+
+// Isolation is the (PodKey, Direction) key of the Isolated map. It is a
+// plain comparable struct so it can be used directly as a map key.
+type Isolation struct {
+	// PodKey identifies the local pod as "namespace/name".
+	PodKey string
+	// Direction is the traffic direction (ingress or egress) in which the
+	// pod is isolated.
+	Direction Direction
+}
+
+// Translate converts the cluster view in `in` into the allow rules,
+// isolation set and local-pod table consumed by the renderer.
+//
+// For every local pod that has at least one IP, each policy in the pod's
+// own namespace whose podSelector matches the pod's labels contributes:
+//
+//   - an entry in Output.Pods (HostIface + a copy of the pod's IPs),
+//   - an Output.Isolated entry per direction the policy applies to, and
+//   - the Rules produced from the policy's ingress / egress rule lists.
+//
+// Rules are appended in LocalPods order, then policy order (sorted by
+// namespace, name; see canonicalisePolicies), ingress before egress.
+//
+// Malformed pieces never abort the pass: an invalid podSelector or a
+// sub-rule that fails to translate is reported through warn and skipped,
+// so a single broken policy can't take down enforcement for a whole node.
+// A nil warn silently drops those messages. The returned error is
+// reserved for unrecoverable input problems and is currently always nil.
+func Translate(in Inputs, warn func(string)) (Output, error) {
+	if warn == nil {
+		warn = func(string) {}
+	}
+
+	result := Output{
+		Isolated: map[Isolation]struct{}{},
+		Pods:     map[string]LocalPod{},
+	}
+	sorted := canonicalisePolicies(in.Policies)
+	namespaces := indexNamespaces(in.Namespaces)
+	peersByNS := indexPeerPods(in.PeerPods)
+
+	for _, local := range in.LocalPods {
+		// A pod without an allocation yet has nothing to match on.
+		if len(local.IPs) == 0 {
+			continue
+		}
+		podKey := local.Namespace + "/" + local.Name
+
+		for idx := range sorted {
+			pol := &sorted[idx]
+
+			// A NetworkPolicy only ever selects pods in its own namespace.
+			if pol.Namespace != local.Namespace {
+				continue
+			}
+			selector, err := metav1.LabelSelectorAsSelector(&pol.Spec.PodSelector)
+			if err != nil {
+				warn(fmt.Sprintf("policy %s/%s: invalid podSelector: %v", pol.Namespace, pol.Name, err))
+				continue
+			}
+			if !selector.Matches(labels.Set(local.Labels)) {
+				continue
+			}
+
+			wantIngress, wantEgress := policyDirections(pol)
+			if !wantIngress && !wantEgress {
+				continue // policy applies to no known direction
+			}
+
+			// Record the pod even if no allow rule ends up being emitted,
+			// so the renderer can still build its default-deny chain.
+			result.Pods[podKey] = LocalPod{
+				PodKey:    podKey,
+				HostIface: local.HostIface,
+				IPs:       append([]net.IP(nil), local.IPs...),
+			}
+
+			if wantIngress {
+				result.Isolated[Isolation{PodKey: podKey, Direction: DirIngress}] = struct{}{}
+				for ruleIdx, ingressRule := range pol.Spec.Ingress {
+					built, err := buildIngressRules(local, ingressRule, pol.Namespace, namespaces, peersByNS)
+					if err != nil {
+						warn(fmt.Sprintf("policy %s/%s ingress[%d]: %v", pol.Namespace, pol.Name, ruleIdx, err))
+						continue
+					}
+					result.Rules = append(result.Rules, built...)
+				}
+			}
+
+			if wantEgress {
+				result.Isolated[Isolation{PodKey: podKey, Direction: DirEgress}] = struct{}{}
+				for ruleIdx, egressRule := range pol.Spec.Egress {
+					built, err := buildEgressRules(local, egressRule, pol.Namespace, namespaces, peersByNS)
+					if err != nil {
+						warn(fmt.Sprintf("policy %s/%s egress[%d]: %v", pol.Namespace, pol.Name, ruleIdx, err))
+						continue
+					}
+					result.Rules = append(result.Rules, built...)
+				}
+			}
+		}
+	}
+	return result, nil
+}
+
+// policyDirections reports which traffic directions the policy isolates.
+//
+// When Spec.PolicyTypes is non-empty it is authoritative: a direction is
+// reported only if its type is listed (unrecognised entries are ignored).
+// When it is empty the directions are inferred: ingress is always
+// isolated, egress only when Spec.Egress contains at least one rule.
+func policyDirections(p *netv1.NetworkPolicy) (ingress, egress bool) {
+	declared := p.Spec.PolicyTypes
+	if len(declared) == 0 {
+		// No explicit types: infer from the populated rule lists.
+		return true, len(p.Spec.Egress) > 0
+	}
+	for _, pt := range declared {
+		if pt == netv1.PolicyTypeIngress {
+			ingress = true
+		} else if pt == netv1.PolicyTypeEgress {
+			egress = true
+		}
+	}
+	return ingress, egress
+}
+
+// buildIngressRules turns one NetworkPolicyIngressRule into the Rule(s)
+// that admit traffic to pod. The rule's Ports are translated first, then
+// its From peers are resolved (relative to policyNS for pod-only
+// selectors); the first failure is returned and no Rule is produced.
+func buildIngressRules(
+	pod Pod,
+	r netv1.NetworkPolicyIngressRule,
+	policyNS string,
+	nsByName map[string]Namespace,
+	peerPodsByNS map[string][]PeerPod,
+) ([]Rule, error) {
+	portMatches, portErr := translatePorts(r.Ports)
+	if portErr != nil {
+		return nil, portErr
+	}
+	sources, peerErr := translatePeers(r.From, policyNS, nsByName, peerPodsByNS)
+	if peerErr != nil {
+		return nil, peerErr
+	}
+	built := assembleRules(pod, DirIngress, sources, portMatches)
+	return built, nil
+}
+
+// buildEgressRules turns one NetworkPolicyEgressRule into the Rule(s)
+// that admit traffic leaving pod. It mirrors buildIngressRules: Ports are
+// translated first, then the To peers are resolved; the first failure is
+// returned and no Rule is produced.
+func buildEgressRules(
+	pod Pod,
+	r netv1.NetworkPolicyEgressRule,
+	policyNS string,
+	nsByName map[string]Namespace,
+	peerPodsByNS map[string][]PeerPod,
+) ([]Rule, error) {
+	portMatches, portErr := translatePorts(r.Ports)
+	if portErr != nil {
+		return nil, portErr
+	}
+	destinations, peerErr := translatePeers(r.To, policyNS, nsByName, peerPodsByNS)
+	if peerErr != nil {
+		return nil, peerErr
+	}
+	built := assembleRules(pod, DirEgress, destinations, portMatches)
+	return built, nil
+}
+
+// peerSet is the resolved peer information for one rule's From / To list.
+//
+// NOTE(review): CIDRs and Except are pooled across every peer entry of
+// the rule, so the per-peer pairing of an ipBlock's cidr with its own
+// except list is lost. If the renderer subtracts Except from all CIDRs
+// (presumably it does — confirm), an except from one ipBlock can also
+// block addresses that another peer entry of the same rule allows.
+type peerSet struct {
+	// allowAll is true when the rule has no peers at all (an empty From /
+	// To list, which the spec defines as "from anywhere"). It overrides
+	// CIDRs and Except.
+	allowAll bool
+	// CIDRs is the union of every IP / CIDR contributed by the rule's
+	// peer entries (resolved Pod IPs, namespace pods, and ipBlock.cidr).
+	CIDRs []*net.IPNet
+	// Except is the union of every ipBlock.except entry across the rule.
+	Except []*net.IPNet
+}
+
+// translatePeers resolves a list of NetworkPolicyPeer entries into a
+// peerSet. Each peer entry contributes either CIDRs (resolved from
+// pod / namespace selectors, or copied from ipBlock) or Except entries.
+func translatePeers(
+	peers []netv1.NetworkPolicyPeer,
+	policyNS string,
+	nsByName map[string]Namespace,
+	peerPodsByNS map[string][]PeerPod,
+) (peerSet, error) {
+	if len(peers) == 0 {
+		return peerSet{allowAll: true}, nil
+	}
+	out := peerSet{}
+	for i, p := range peers {
+		switch {
+		case p.IPBlock != nil:
+			_, cidr, err := net.ParseCIDR(p.IPBlock.CIDR)
+			if err != nil {
+				return peerSet{}, fmt.Errorf("peer[%d] ipBlock.cidr %q: %w", i, p.IPBlock.CIDR, err)
+			}
+			out.CIDRs = append(out.CIDRs, cidr)
+			for j, ex := range p.IPBlock.Except {
+				_, exNet, err := net.ParseCIDR(ex)
+				if err != nil {
+					return peerSet{}, fmt.Errorf("peer[%d] ipBlock.except[%d] %q: %w", i, j, ex, err)
+				}
+				out.Except = append(out.Except, exNet)
+			}
+		case p.PodSelector != nil || p.NamespaceSelector != nil:
+			ips, err := resolvePodNamespacePeer(p, policyNS, nsByName, peerPodsByNS)
+			if err != nil {
+				return peerSet{}, fmt.Errorf("peer[%d]: %w", i, err)
+			}
+			out.CIDRs = append(out.CIDRs, ips...)
+		default:
+			return peerSet{}, fmt.Errorf("peer[%d] is empty (must set ipBlock, podSelector, or namespaceSelector)", i)
+		}
+	}
+	return out, nil
+}
+
+// resolvePodNamespacePeer walks the cluster's peer-pod set and returns
+// /128 (v6) and /32 (v4) CIDRs for each pod that matches the (possibly
+// combined) pod + namespace selectors.
+//
+// Selector semantics from the NetworkPolicy spec:
+//
+//   - podSelector + namespaceSelector both nil → handled upstream.
+//   - podSelector set, namespaceSelector nil → match in the policy's
+//     own namespace.
+//   - podSelector nil, namespaceSelector set → match every pod in
+//     namespaces that match the namespaceSelector.
+//   - both set → AND: pod must be in a matching namespace AND match
+//     the podSelector.
+//
+// An empty (non-nil) selector matches everything in scope.
+//
+// The result is ordered by namespace name, then by the order of pods in
+// peerPodsByNS[namespace], then by each pod's IP order, so identical
+// inputs always yield an identically ordered slice. An error is returned
+// only when one of the selectors cannot be converted.
+func resolvePodNamespacePeer(
+	p netv1.NetworkPolicyPeer,
+	policyNS string,
+	nsByName map[string]Namespace,
+	peerPodsByNS map[string][]PeerPod,
+) ([]*net.IPNet, error) {
+	var podSel, nsSel labels.Selector
+	if p.PodSelector != nil {
+		s, err := metav1.LabelSelectorAsSelector(p.PodSelector)
+		if err != nil {
+			return nil, fmt.Errorf("podSelector: %w", err)
+		}
+		podSel = s
+	}
+	if p.NamespaceSelector != nil {
+		s, err := metav1.LabelSelectorAsSelector(p.NamespaceSelector)
+		if err != nil {
+			return nil, fmt.Errorf("namespaceSelector: %w", err)
+		}
+		nsSel = s
+	}
+
+	// Decide which namespaces are in scope.
+	var inScope []string
+	if nsSel == nil {
+		// Pod-only selector → just the policy's own namespace.
+		inScope = []string{policyNS}
+	} else {
+		for name, ns := range nsByName {
+			if nsSel.Matches(labels.Set(ns.Labels)) {
+				inScope = append(inScope, name)
+			}
+		}
+		// Map iteration order is randomised; sort so the emitted CIDR
+		// order (and therefore the rendered script) is reproducible.
+		sort.Strings(inScope)
+	}
+
+	var out []*net.IPNet
+	for _, ns := range inScope {
+		for _, pp := range peerPodsByNS[ns] {
+			if podSel != nil && !podSel.Matches(labels.Set(pp.Labels)) {
+				continue
+			}
+			for _, ip := range pp.IPs {
+				out = append(out, ipToHostCIDR(ip))
+			}
+		}
+	}
+	return out, nil
+}
+
+// translatePorts converts NetworkPolicyPort entries into PortMatch.
+//
+// A nil/empty Ports list on a NetworkPolicy rule means "all ports" by
+// spec; we represent that as a single zero-valued PortMatch (any proto,
+// any port) so the renderer can emit a single rule rather than a chain
+// of port-equality matches.
+//
+// For a non-empty list, each entry yields one PortMatch:
+//
+//   - Protocol: TCP / UDP / SCTP map to "tcp" / "udp" / "sctp"; an unset
+//     protocol defaults to "tcp"; anything else is an error.
+//   - Port: must be numeric and within 1-65535 when set; named ports are
+//     rejected. An unset port leaves PortMatch.Port at 0.
+//   - EndPort: only valid together with a numeric Port, must be >= Port
+//     and <= 65535.
+//
+// The first invalid entry fails the whole list with an error naming its
+// index, so the caller can skip the enclosing rule instead of handing the
+// renderer a value it cannot express.
+func translatePorts(ports []netv1.NetworkPolicyPort) ([]PortMatch, error) {
+	if len(ports) == 0 {
+		return []PortMatch{{}}, nil
+	}
+	out := make([]PortMatch, 0, len(ports))
+	for i, p := range ports {
+		// Spec default when the protocol is omitted: TCP.
+		protoStr := "tcp"
+		if p.Protocol != nil {
+			switch *p.Protocol {
+			case "TCP":
+				protoStr = "tcp"
+			case "UDP":
+				protoStr = "udp"
+			case "SCTP":
+				protoStr = "sctp"
+			default:
+				return nil, fmt.Errorf("port[%d]: protocol %q not supported", i, *p.Protocol)
+			}
+		}
+		var port, endPort int
+		if p.Port != nil {
+			if p.Port.Type != 0 { // intstr.Int = 0; intstr.String = 1
+				return nil, fmt.Errorf("port[%d]: named ports are not yet supported", i)
+			}
+			port = int(p.Port.IntVal)
+			// An explicit 0 would be indistinguishable from "no port
+			// filter" downstream; anything above 65535 is not a port.
+			if port < 1 || port > 65535 {
+				return nil, fmt.Errorf("port[%d]: port %d out of range 1-65535", i, port)
+			}
+		}
+		if p.EndPort != nil {
+			// The API forbids endPort unless a numeric port is set.
+			if p.Port == nil {
+				return nil, fmt.Errorf("port[%d]: endPort requires port to be set", i)
+			}
+			endPort = int(*p.EndPort)
+			if endPort < port {
+				return nil, fmt.Errorf("port[%d]: endPort %d < port %d", i, endPort, port)
+			}
+			if endPort > 65535 {
+				return nil, fmt.Errorf("port[%d]: endPort %d out of range 1-65535", i, endPort)
+			}
+		}
+		out = append(out, PortMatch{Protocol: protoStr, Port: port, EndPort: endPort})
+	}
+	return out, nil
+}
+
+// assembleRules emits the cross-product of (one peer-set) × (port list).
+// We currently emit a single Rule per direction since the peer-set is the
+// expensive shared field; ports go inline. allowAll peers result in a
+// rule with no PeerCIDRs, which the renderer treats as "any source".
+func assembleRules(pod Pod, dir Direction, peers peerSet, ports []PortMatch) []Rule {
+	if !peers.allowAll && len(peers.CIDRs) == 0 {
+		// Selector matched no peers (e.g. podSelector for a label that
+		// no live pod has). Emit nothing — the rule cannot allow any
+		// real traffic. The pod stays in default-deny for this rule.
+		return nil
+	}
+	r := Rule{
+		PodKey:    pod.Namespace + "/" + pod.Name,
+		HostIface: pod.HostIface,
+		PodIPs:    append([]net.IP(nil), pod.IPs...),
+		Direction: dir,
+		Action:    ActionAccept,
+		Ports:     append([]PortMatch(nil), ports...),
+	}
+	if !peers.allowAll {
+		r.PeerCIDRs = append([]*net.IPNet(nil), peers.CIDRs...)
+		r.PeerExcept = append([]*net.IPNet(nil), peers.Except...)
+	}
+	return []Rule{r}
+}
+
+// canonicalisePolicies returns a copy of p ordered by namespace and then
+// by name, leaving the caller's slice untouched. Working from this fixed
+// order keeps the translator's output independent of the order in which
+// informer events delivered the policies.
+func canonicalisePolicies(p []netv1.NetworkPolicy) []netv1.NetworkPolicy {
+	sorted := append([]netv1.NetworkPolicy(nil), p...)
+	before := func(a, b int) bool {
+		left, right := &sorted[a], &sorted[b]
+		if left.Namespace == right.Namespace {
+			return left.Name < right.Name
+		}
+		return left.Namespace < right.Namespace
+	}
+	sort.Slice(sorted, before)
+	return sorted
+}
+
+// indexNamespaces builds a name → Namespace lookup table from nss. If
+// the same name appears more than once, the last entry wins.
+func indexNamespaces(nss []Namespace) map[string]Namespace {
+	byName := make(map[string]Namespace, len(nss))
+	for i := range nss {
+		byName[nss[i].Name] = nss[i]
+	}
+	return byName
+}
+
+// indexPeerPods groups pods by namespace for selector resolution.
+//
+// Within each namespace the pods are sorted by name so the translator's
+// IP ordering is stable, and entries sharing a (namespace, name) are
+// collapsed to the first one seen in the input. This is what lets callers
+// include the node's local pods in the cluster-wide peer list without
+// producing duplicate peer CIDRs.
+func indexPeerPods(pods []PeerPod) map[string][]PeerPod {
+	out := map[string][]PeerPod{}
+	for _, p := range pods {
+		out[p.Namespace] = append(out[p.Namespace], p)
+	}
+	for ns, list := range out {
+		// Stable sort: among equal names the input order is preserved,
+		// so "first occurrence wins" below is deterministic.
+		sort.SliceStable(list, func(i, j int) bool { return list[i].Name < list[j].Name })
+
+		// Compact runs of equal names in place. Every list has at least
+		// one element because it was created by an append above.
+		uniq := list[:1]
+		for _, p := range list[1:] {
+			if p.Name != uniq[len(uniq)-1].Name {
+				uniq = append(uniq, p)
+			}
+		}
+		out[ns] = uniq
+	}
+	return out
+}
+
+// ipToHostCIDR wraps a single address in the narrowest network that
+// contains it: a /32 holding the 4-byte form for IPv4 (including
+// IPv4-mapped addresses), otherwise a /128 holding the 16-byte form.
+func ipToHostCIDR(ip net.IP) *net.IPNet {
+	addr, bits := ip.To16(), 128
+	if four := ip.To4(); four != nil {
+		addr, bits = four, 32
+	}
+	return &net.IPNet{IP: addr, Mask: net.CIDRMask(bits, bits)}
+}