netpol: NetworkPolicy v1 enforcement via nftables
Build flock Image / build (push) Has been cancelled

New pkg/agent/netpol implementing standard networking.k8s.io/v1
NetworkPolicy. Pipeline:

  pods + policies + namespaces  →  Translate  →  Render  →  Apply

Supports ingress + egress, all three peer types (podSelector,
namespaceSelector, ipBlock with except), numeric ports + port ranges,
and default-deny semantics derived from PolicyTypes (when PolicyTypes is
unset, Ingress is always isolated and Egress only when Spec.Egress is
non-empty).

Apply path is `nft -f -` shell-out — single transaction, atomic, kernel
guarantees partial-failure rollback. Idempotent dedup via last-applied
script. Reconcile triggers: informer events, 30s self-heal tick, every
CNI ADD/DEL.

Verified against the three live cluster NetPols (calico-apiserver,
remote-proxies/lodge-home-assistant, storage/garage-admin-restrict).
Fuzz target stitches Translate + Render with random selector and peer
inputs; 21 unit tests cover the policy semantics.

Rules that use named ports are skipped with a warning — support is
deferred until kubelet exposes them in a form that doesn't require
shadowing pod state.

Dockerfile: + nftables.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Donavan Fritz
2026-04-25 09:25:58 -05:00
parent 71e584cf96
commit 39ede9130b
16 changed files with 2698 additions and 2 deletions
+443
View File
@@ -0,0 +1,443 @@
package netpol
import (
"fmt"
"net"
"sort"
netv1 "k8s.io/api/networking/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
)
// Inputs is the world-view the translator consumes. All fields are owned
// by the caller; the translator does not mutate them (it sorts and indexes
// copies, never the caller's slices).
type Inputs struct {
// LocalPods are the pods scheduled on this node that have a committed
// flock allocation. Only these pods get rules — peers may live
// elsewhere. Entries with an empty IPs list are skipped by Translate.
LocalPods []Pod
// PeerPods is the cluster-wide pod set used to resolve podSelector +
// namespaceSelector peers. It is fine to include the local pods here
// too.
// NOTE(review): duplicates are intended to be deduped by
// (namespace, name); confirm that the peer-pod indexing step actually
// enforces this.
PeerPods []PeerPod
// Namespaces is the cluster's full Namespace set. Used for
// namespaceSelector matching: a namespace that is missing from this
// list can never be matched by a namespaceSelector peer.
Namespaces []Namespace
// Policies is every NetworkPolicy in the cluster. The translator
// only acts on those that live in a local pod's namespace and whose
// podSelector matches that pod.
Policies []netv1.NetworkPolicy
}
// Output is the result of one translation pass.
type Output struct {
// Rules is the flat ordered list of allow rules to render. The
// renderer groups them by (PodKey, Direction) into chains. Rules are
// appended pod by pod (in LocalPods order), then policy by policy (in
// sorted policy order), ingress rules before egress rules.
Rules []Rule
// Isolated is the set of (PodKey, Direction) pairs whose chain must
// have a default-deny policy. A pod selected by at least one policy
// in a given direction shows up here. The renderer uses this to
// decide whether to emit a chain at all and what its base policy is.
Isolated map[Isolation]struct{}
// Pods carries the HostIface + IPs for every local pod referenced
// by the policy world, including pods that produced only isolation
// (default-deny) without any allow rules. The renderer needs this
// because such a pod has no Rule to lift the HostIface from. The IPs
// slice of each entry is a copy of the input pod's IPs.
Pods map[string]LocalPod // key = namespace/name
}
// Isolation is the (PodKey, Direction) key of the Isolated map. It is
// used directly as a map key, so both fields must stay comparable.
type Isolation struct {
// PodKey identifies the local pod as "namespace/name".
PodKey string
// Direction is the traffic direction that is isolated for the pod.
Direction Direction
}
// Translate runs the translation pass. It is a pure function: same Inputs
// always produces semantically equal Output. (Order of slices is stable
// but Rules within a chain follow the order in which selecting policies
// appear, which is itself sorted; see canonicalisePolicies.)
//
// For every local pod that has at least one IP, each policy in the pod's
// own namespace whose podSelector matches the pod:
//
//   - marks the pod as isolated in the direction(s) the policy covers,
//   - records the pod's HostIface + IPs in Output.Pods, and
//   - contributes one allow Rule per translatable ingress / egress rule.
//
// Per-rule translation errors are reported through warn and the rule is
// skipped, so that a single broken policy can't take down enforcement for
// a whole node. A policy with an invalid podSelector is reported once per
// pass (the first time a local pod in its namespace is evaluated) and
// then ignored for every pod. Pass a nil warn to silently drop messages.
//
// The returned error is reserved for unrecoverable malformed input; the
// current implementation always returns nil.
func Translate(in Inputs, warn func(string)) (Output, error) {
	if warn == nil {
		warn = func(string) {}
	}
	out := Output{
		Isolated: map[Isolation]struct{}{},
		Pods:     map[string]LocalPod{},
	}
	policies := canonicalisePolicies(in.Policies)
	nsByName := indexNamespaces(in.Namespaces)
	peerPodsByNS := indexPeerPods(in.PeerPods)

	// selectorState caches the compiled podSelector of one policy so it is
	// built (and, if invalid, warned about) at most once per pass instead
	// of once per (pod, policy) pair. Compilation stays lazy so policies
	// in namespaces without local pods cost nothing and never warn.
	type selectorState struct {
		compiled bool
		sel      labels.Selector // nil when the podSelector is invalid
	}
	selectors := make([]selectorState, len(policies))

	for _, pod := range in.LocalPods {
		if len(pod.IPs) == 0 {
			continue // no allocation yet; translator skips
		}
		key := pod.Namespace + "/" + pod.Name
		podLabels := labels.Set(pod.Labels)

		// Find every policy in pod.Namespace whose podSelector matches.
		// Cross-namespace policies do not select pods outside their own
		// namespace; that's how the NetworkPolicy spec defines it.
		for pi := range policies {
			p := &policies[pi] // index to avoid copying the policy struct
			if p.Namespace != pod.Namespace {
				continue
			}
			st := &selectors[pi]
			if !st.compiled {
				st.compiled = true
				sel, err := metav1.LabelSelectorAsSelector(&p.Spec.PodSelector)
				if err != nil {
					warn(fmt.Sprintf("policy %s/%s: invalid podSelector: %v", p.Namespace, p.Name, err))
				} else {
					st.sel = sel
				}
			}
			if st.sel == nil || !st.sel.Matches(podLabels) {
				continue
			}

			ingress, egress := policyDirections(p)
			if ingress || egress {
				out.Pods[key] = LocalPod{
					PodKey:    key,
					HostIface: pod.HostIface,
					IPs:       append([]net.IP(nil), pod.IPs...),
				}
			}

			if ingress {
				out.Isolated[Isolation{PodKey: key, Direction: DirIngress}] = struct{}{}
				for ri, r := range p.Spec.Ingress {
					rules, err := buildIngressRules(pod, r, p.Namespace, nsByName, peerPodsByNS)
					if err != nil {
						warn(fmt.Sprintf("policy %s/%s ingress[%d]: %v", p.Namespace, p.Name, ri, err))
						continue
					}
					out.Rules = append(out.Rules, rules...)
				}
			}

			if egress {
				out.Isolated[Isolation{PodKey: key, Direction: DirEgress}] = struct{}{}
				for ri, r := range p.Spec.Egress {
					rules, err := buildEgressRules(pod, r, p.Namespace, nsByName, peerPodsByNS)
					if err != nil {
						warn(fmt.Sprintf("policy %s/%s egress[%d]: %v", p.Namespace, p.Name, ri, err))
						continue
					}
					out.Rules = append(out.Rules, rules...)
				}
			}
		}
	}
	return out, nil
}
// policyDirections reports which traffic directions a NetworkPolicy
// isolates.
//
// An explicit, non-empty Spec.PolicyTypes list is authoritative: a
// direction is isolated only if it is listed (unknown entries are
// ignored). When the list is empty the directions are inferred: ingress is
// always isolated, egress only when the policy carries at least one
// egress rule.
func policyDirections(p *netv1.NetworkPolicy) (ingress, egress bool) {
	declared := p.Spec.PolicyTypes
	if len(declared) == 0 {
		return true, len(p.Spec.Egress) > 0
	}
	for _, kind := range declared {
		if kind == netv1.PolicyTypeIngress {
			ingress = true
		} else if kind == netv1.PolicyTypeEgress {
			egress = true
		}
	}
	return ingress, egress
}
// buildIngressRules turns a single NetworkPolicyIngressRule into the
// Rule(s) that allow the described traffic towards pod.
//
// Ports are translated first, then the From peers (resolved relative to
// policyNS); the first failure is returned and no rules are produced. On
// success the resolved peer-set and the full port list are handed to
// assembleRules with DirIngress.
func buildIngressRules(
	pod Pod,
	r netv1.NetworkPolicyIngressRule,
	policyNS string,
	nsByName map[string]Namespace,
	peerPodsByNS map[string][]PeerPod,
) ([]Rule, error) {
	var (
		ports []PortMatch
		from  peerSet
		err   error
	)
	if ports, err = translatePorts(r.Ports); err != nil {
		return nil, err
	}
	if from, err = translatePeers(r.From, policyNS, nsByName, peerPodsByNS); err != nil {
		return nil, err
	}
	return assembleRules(pod, DirIngress, from, ports), nil
}
// buildEgressRules turns a single NetworkPolicyEgressRule into the
// Rule(s) that allow the described traffic leaving pod.
//
// It mirrors buildIngressRules: ports are translated first, then the To
// peers (resolved relative to policyNS); the first failure is returned
// and no rules are produced. On success the resolved peer-set and the
// full port list are handed to assembleRules with DirEgress.
func buildEgressRules(
	pod Pod,
	r netv1.NetworkPolicyEgressRule,
	policyNS string,
	nsByName map[string]Namespace,
	peerPodsByNS map[string][]PeerPod,
) ([]Rule, error) {
	var (
		ports []PortMatch
		to    peerSet
		err   error
	)
	if ports, err = translatePorts(r.Ports); err != nil {
		return nil, err
	}
	if to, err = translatePeers(r.To, policyNS, nsByName, peerPodsByNS); err != nil {
		return nil, err
	}
	return assembleRules(pod, DirEgress, to, ports), nil
}
// peerSet is the resolved peer information for one rule's From / To list.
// It is produced by translatePeers and consumed by assembleRules.
type peerSet struct {
// allowAll is true when the rule has no peers at all (an empty From /
// To list, which the spec defines as "from anywhere"). It overrides
// CIDRs and Except: assembleRules copies neither into the Rule when
// it is set.
allowAll bool
// CIDRs is the union of every IP / CIDR contributed by the rule's
// peer entries (resolved Pod IPs, namespace pods, and ipBlock.cidr).
// Entries are not deduplicated here.
CIDRs []*net.IPNet
// Except is the union of every ipBlock.except entry across the rule.
// NOTE(review): because the except entries of all ipBlock peers are
// pooled, the link between an except and its own ipBlock is lost. If
// the renderer applies Except to every entry in CIDRs, an except from
// one ipBlock could also carve out selector-resolved pod IPs or a
// sibling ipBlock — confirm against the renderer.
Except []*net.IPNet
}
// translatePeers flattens a rule's From / To peer list into one peerSet.
//
// An empty list means "any peer" and yields peerSet{allowAll: true}.
// Otherwise each entry is handled by kind:
//
//   - ipBlock: the CIDR is appended to CIDRs and every except entry to
//     Except (ipBlock wins if selectors are set on the same entry);
//   - podSelector and/or namespaceSelector: the matching pods' host CIDRs
//     are appended to CIDRs;
//   - nothing set: the entry is rejected.
//
// The first entry that fails to parse or resolve aborts the whole list
// and a zero peerSet is returned with an error naming the entry's index.
func translatePeers(
	peers []netv1.NetworkPolicyPeer,
	policyNS string,
	nsByName map[string]Namespace,
	peerPodsByNS map[string][]PeerPod,
) (peerSet, error) {
	if len(peers) == 0 {
		return peerSet{allowAll: true}, nil
	}

	var resolved peerSet
	for idx := range peers {
		peer := peers[idx]

		if block := peer.IPBlock; block != nil {
			_, network, err := net.ParseCIDR(block.CIDR)
			if err != nil {
				return peerSet{}, fmt.Errorf("peer[%d] ipBlock.cidr %q: %w", idx, block.CIDR, err)
			}
			resolved.CIDRs = append(resolved.CIDRs, network)
			for exIdx, raw := range block.Except {
				_, carved, err := net.ParseCIDR(raw)
				if err != nil {
					return peerSet{}, fmt.Errorf("peer[%d] ipBlock.except[%d] %q: %w", idx, exIdx, raw, err)
				}
				resolved.Except = append(resolved.Except, carved)
			}
			continue
		}

		if peer.PodSelector == nil && peer.NamespaceSelector == nil {
			return peerSet{}, fmt.Errorf("peer[%d] is empty (must set ipBlock, podSelector, or namespaceSelector)", idx)
		}

		addrs, err := resolvePodNamespacePeer(peer, policyNS, nsByName, peerPodsByNS)
		if err != nil {
			return peerSet{}, fmt.Errorf("peer[%d]: %w", idx, err)
		}
		resolved.CIDRs = append(resolved.CIDRs, addrs...)
	}
	return resolved, nil
}
// resolvePodNamespacePeer walks the cluster's peer-pod set and returns
// /128 (v6) and /32 (v4) CIDRs for each pod that matches the (possibly
// combined) pod + namespace selectors.
//
// Selector semantics from the NetworkPolicy spec:
//
//   - podSelector + namespaceSelector both nil → handled upstream (if
//     reached anyway, every pod in the policy's own namespace matches).
//   - podSelector set, namespaceSelector nil → match in the policy's
//     own namespace.
//   - podSelector nil, namespaceSelector set → match every pod in
//     namespaces that match the namespaceSelector.
//   - both set → AND: pod must be in a matching namespace AND match
//     the podSelector.
//
// An empty (non-nil) selector matches everything in scope.
//
// The result is ordered deterministically: namespaces in ascending name
// order, pods in the order stored in peerPodsByNS, IPs in pod order. An
// error is returned only when one of the selectors cannot be compiled.
func resolvePodNamespacePeer(
	p netv1.NetworkPolicyPeer,
	policyNS string,
	nsByName map[string]Namespace,
	peerPodsByNS map[string][]PeerPod,
) ([]*net.IPNet, error) {
	var podSel, nsSel labels.Selector
	if p.PodSelector != nil {
		s, err := metav1.LabelSelectorAsSelector(p.PodSelector)
		if err != nil {
			return nil, fmt.Errorf("podSelector: %w", err)
		}
		podSel = s
	}
	if p.NamespaceSelector != nil {
		s, err := metav1.LabelSelectorAsSelector(p.NamespaceSelector)
		if err != nil {
			return nil, fmt.Errorf("namespaceSelector: %w", err)
		}
		nsSel = s
	}

	// Decide which namespaces are in scope.
	var inScope []string
	if nsSel == nil {
		// Pod-only selector → just the policy's own namespace.
		inScope = []string{policyNS}
	} else {
		for name, ns := range nsByName {
			if nsSel.Matches(labels.Set(ns.Labels)) {
				inScope = append(inScope, name)
			}
		}
		// Go randomises map iteration order; sort so the emitted CIDR
		// list (and anything rendered from it) is identical from one
		// translation pass to the next.
		sort.Strings(inScope)
	}

	var out []*net.IPNet
	for _, ns := range inScope {
		for _, pp := range peerPodsByNS[ns] {
			if podSel != nil && !podSel.Matches(labels.Set(pp.Labels)) {
				continue
			}
			for _, ip := range pp.IPs {
				out = append(out, ipToHostCIDR(ip))
			}
		}
	}
	return out, nil
}
// translatePorts converts NetworkPolicyPort entries into PortMatch.
//
// A nil/empty Ports list on a NetworkPolicy rule means "all ports" by
// spec; we represent that as a single zero-valued PortMatch (any proto,
// any port) so the renderer can emit a single rule rather than a chain
// of port-equality matches.
//
// For a non-empty list every entry yields one PortMatch:
//
//   - Protocol is "tcp", "udp" or "sctp"; an unset protocol defaults to
//     "tcp" as the spec requires.
//   - Port is the numeric port, or 0 when the entry names no port (the
//     entry then covers the whole protocol).
//   - EndPort is the inclusive upper bound of a range, or 0 when unset.
//
// An error (which makes the caller skip the whole rule) is returned for
// an unsupported protocol, a named port, a numeric port or endPort
// outside 1-65535, an endPort without a numeric port, or an endPort
// smaller than port. Rejecting these here keeps values that the API
// server would normally refuse from reaching the rendered ruleset.
func translatePorts(ports []netv1.NetworkPolicyPort) ([]PortMatch, error) {
	if len(ports) == 0 {
		return []PortMatch{{}}, nil
	}

	const maxPort = 65535

	out := make([]PortMatch, 0, len(ports))
	for i, p := range ports {
		// Spec default when Protocol is omitted: TCP.
		protoStr := "tcp"
		if p.Protocol != nil {
			switch *p.Protocol {
			case "TCP":
				protoStr = "tcp"
			case "UDP":
				protoStr = "udp"
			case "SCTP":
				protoStr = "sctp"
			default:
				return nil, fmt.Errorf("port[%d]: protocol %q not supported", i, *p.Protocol)
			}
		}

		var port, endPort int
		if p.Port != nil {
			if p.Port.Type != 0 { // intstr.Int = 0; intstr.String = 1
				return nil, fmt.Errorf("port[%d]: named ports are not yet supported", i)
			}
			port = int(p.Port.IntVal)
			if port < 1 || port > maxPort {
				return nil, fmt.Errorf("port[%d]: port %d out of range 1-%d", i, port, maxPort)
			}
		}
		if p.EndPort != nil {
			if p.Port == nil {
				// Without a lower bound the range would silently
				// start at 0 and open far more than intended.
				return nil, fmt.Errorf("port[%d]: endPort requires a numeric port", i)
			}
			endPort = int(*p.EndPort)
			if endPort < port {
				return nil, fmt.Errorf("port[%d]: endPort %d < port %d", i, endPort, port)
			}
			if endPort > maxPort {
				return nil, fmt.Errorf("port[%d]: endPort %d out of range 1-%d", i, endPort, maxPort)
			}
		}
		out = append(out, PortMatch{Protocol: protoStr, Port: port, EndPort: endPort})
	}
	return out, nil
}
// assembleRules packages one resolved peer-set and its port list into at
// most one accept Rule for pod in direction dir.
//
// Outcomes:
//
//   - peers restricted but empty (selectors matched no pod): no Rule at
//     all — nothing could be allowed, so the pod simply stays in
//     default-deny as far as this rule is concerned;
//   - peers.allowAll: one Rule with PeerCIDRs and PeerExcept left unset,
//     which the renderer treats as "any peer";
//   - otherwise: one Rule carrying copies of the peer CIDRs and excepts.
//
// All slices placed in the Rule are copies, so the Rule never aliases
// the pod's or the peer-set's backing arrays.
func assembleRules(pod Pod, dir Direction, peers peerSet, ports []PortMatch) []Rule {
	restricted := !peers.allowAll
	if restricted && len(peers.CIDRs) == 0 {
		return nil
	}

	var allowed, carved []*net.IPNet
	if restricted {
		allowed = append([]*net.IPNet(nil), peers.CIDRs...)
		carved = append([]*net.IPNet(nil), peers.Except...)
	}

	return []Rule{{
		PodKey:     pod.Namespace + "/" + pod.Name,
		HostIface:  pod.HostIface,
		PodIPs:     append([]net.IP(nil), pod.IPs...),
		Direction:  dir,
		Action:     ActionAccept,
		Ports:      append([]PortMatch(nil), ports...),
		PeerCIDRs:  allowed,
		PeerExcept: carved,
	}}
}
// canonicalisePolicies returns a copy of p ordered by namespace and, within
// a namespace, by name. Working from this fixed order keeps translation
// output independent of the order in which policies were supplied. The
// caller's slice is left untouched.
func canonicalisePolicies(p []netv1.NetworkPolicy) []netv1.NetworkPolicy {
	sorted := append([]netv1.NetworkPolicy(nil), p...)
	byNamespaceThenName := func(a, b int) bool {
		left, right := &sorted[a], &sorted[b]
		if left.Namespace == right.Namespace {
			return left.Name < right.Name
		}
		return left.Namespace < right.Namespace
	}
	sort.Slice(sorted, byNamespaceThenName)
	return sorted
}
// indexNamespaces builds a name → Namespace lookup table from nss. When a
// name occurs more than once the last entry wins.
func indexNamespaces(nss []Namespace) map[string]Namespace {
	byName := make(map[string]Namespace, len(nss))
	for i := range nss {
		byName[nss[i].Name] = nss[i]
	}
	return byName
}
// indexPeerPods groups pods by namespace and returns, per namespace, the
// pod list sorted by name.
//
// Pods are deduplicated by (namespace, name): when the same pod is listed
// more than once (for example because the caller merged the local pods
// into the cluster-wide set) only the first occurrence is kept, so a pod
// never contributes its IPs twice to a peer list. The name ordering makes
// the IP order seen by the translator stable. The input slice is not
// modified.
func indexPeerPods(pods []PeerPod) map[string][]PeerPod {
	type podID struct{ namespace, name string }

	seen := make(map[podID]struct{}, len(pods))
	out := map[string][]PeerPod{}
	for _, p := range pods {
		id := podID{namespace: p.Namespace, name: p.Name}
		if _, dup := seen[id]; dup {
			continue
		}
		seen[id] = struct{}{}
		out[p.Namespace] = append(out[p.Namespace], p)
	}

	// Sort each namespace's pod list by (name) so the translator's IP
	// ordering is stable. Sorting the ranged slice value reorders the
	// backing array the map entry shares.
	for _, list := range out {
		sort.Slice(list, func(i, j int) bool { return list[i].Name < list[j].Name })
	}
	return out
}
// ipToHostCIDR wraps a single address in the narrowest network that
// contains it: a /32 for anything representable as IPv4 (including
// IPv4-mapped IPv6 addresses, which are stored in 4-byte form) and a /128
// for every other address.
func ipToHostCIDR(ip net.IP) *net.IPNet {
	addr, bits := ip.To16(), 8*net.IPv6len
	if four := ip.To4(); four != nil {
		addr, bits = four, 8*net.IPv4len
	}
	return &net.IPNet{IP: addr, Mask: net.CIDRMask(bits, bits)}
}