Files
flock/pkg/agent/netpol/translator.go
T
Donavan Fritz 39ede9130b
Build flock Image / build (push) Has been cancelled
netpol: NetworkPolicy v1 enforcement via nftables
New pkg/agent/netpol implementing standard networking.k8s.io/v1
NetworkPolicy. Pipeline:

  pods + policies + namespaces  →  Translate  →  Render  →  Apply

Supports ingress + egress, all three peer types (podSelector,
namespaceSelector, ipBlock with except), numeric ports + port ranges,
default-deny semantics derived from PolicyTypes (or inferred from
non-empty Spec.Egress when unset).

Apply path is `nft -f -` shell-out — single transaction, atomic, kernel
guarantees partial-failure rollback. Idempotent dedup via last-applied
script. Reconcile triggers: informer events, 30s self-heal tick, every
CNI ADD/DEL.

Verified against the three live cluster NetPols (calico-apiserver,
remote-proxies/lodge-home-assistant, storage/garage-admin-restrict).
Fuzz target stitches Translate + Render with random selector and peer
inputs; 21 unit tests cover the policy semantics.

Named ports skip with a warn — deferred until kubelet exposes them in a
form that doesn't require shadowing pod state.

Dockerfile: + nftables.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 09:25:58 -05:00

444 lines
14 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package netpol
import (
"fmt"
"net"
"sort"
netv1 "k8s.io/api/networking/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
)
// Inputs bundles everything one translation pass reads. The caller keeps
// ownership of every field; Translate only reads them.
type Inputs struct {
	// LocalPods lists the pods on this node that rules are generated
	// for. Entries without any IP are skipped by Translate. Peers of
	// these pods may live on other nodes.
	LocalPods []Pod
	// PeerPods is the pod set that podSelector / namespaceSelector
	// peers are resolved against. The local pods may be included here
	// as well; entries are meant to be unique per (namespace, name).
	PeerPods []PeerPod
	// Namespaces is the Namespace set that namespaceSelector peers are
	// matched against.
	Namespaces []Namespace
	// Policies holds the NetworkPolicy objects to consider. Translate
	// only acts on a policy for the local pods that share its namespace
	// and match its podSelector.
	Policies []netv1.NetworkPolicy
}
// Output is what a single translation pass produces.
type Output struct {
	// Rules is the flat list of allow rules, in the order Translate
	// generated them. Each Rule carries its own PodKey and Direction,
	// which the renderer is expected to group on.
	Rules []Rule
	// Isolated records every (PodKey, Direction) pair for which at
	// least one policy selected the pod in that direction. Such a pair
	// needs a default-deny base even when no allow rule was emitted
	// for it.
	Isolated map[Isolation]struct{}
	// Pods maps "namespace/name" to the HostIface and IPs of every
	// local pod selected by at least one isolating policy. It covers
	// pods that are isolated but have no Rule to read the HostIface
	// from.
	Pods map[string]LocalPod // key = namespace/name
}
// Isolation identifies one isolated direction of one pod. It is the key
// type of Output.Isolated.
type Isolation struct {
	// PodKey is the pod's "namespace/name" key.
	PodKey string
	// Direction is the traffic direction the isolation applies to.
	Direction Direction
}
// Translate turns the supplied pods, namespaces and policies into an
// Output of allow rules plus isolation markers.
//
// Policies are visited in (namespace, name) order for each local pod, so
// the rule order does not depend on the order of in.Policies. A policy
// only applies to pods in its own namespace whose labels match its
// podSelector. Local pods without an IP are skipped entirely.
//
// Problems confined to one policy or one rule (an invalid podSelector, a
// rule that fails to translate) are reported through warn and skipped;
// they never abort the pass. A skipped rule leaves the pod isolated
// without that allowance. warn may be nil, in which case the messages are
// dropped. The returned error is currently always nil.
func Translate(in Inputs, warn func(string)) (Output, error) {
	if warn == nil {
		warn = func(string) {}
	}

	result := Output{
		Isolated: make(map[Isolation]struct{}),
		Pods:     make(map[string]LocalPod),
	}

	ordered := canonicalisePolicies(in.Policies)
	namespaces := indexNamespaces(in.Namespaces)
	peersByNS := indexPeerPods(in.PeerPods)

	for _, local := range in.LocalPods {
		if len(local.IPs) == 0 {
			// Nothing allocated yet, so there is nothing to filter on.
			continue
		}
		podKey := local.Namespace + "/" + local.Name

		for idx := range ordered {
			pol := &ordered[idx]

			// A NetworkPolicy never selects pods outside its namespace.
			if pol.Namespace != local.Namespace {
				continue
			}
			selector, err := metav1.LabelSelectorAsSelector(&pol.Spec.PodSelector)
			if err != nil {
				warn(fmt.Sprintf("policy %s/%s: invalid podSelector: %v", pol.Namespace, pol.Name, err))
				continue
			}
			if !selector.Matches(labels.Set(local.Labels)) {
				continue
			}

			isolateIn, isolateOut := policyDirections(pol)
			if !isolateIn && !isolateOut {
				// The policy selects the pod but isolates no direction.
				continue
			}

			result.Pods[podKey] = LocalPod{
				PodKey:    podKey,
				HostIface: local.HostIface,
				IPs:       append([]net.IP(nil), local.IPs...),
			}

			if isolateIn {
				result.Isolated[Isolation{PodKey: podKey, Direction: DirIngress}] = struct{}{}
				for n, ingressRule := range pol.Spec.Ingress {
					built, err := buildIngressRules(local, ingressRule, pol.Namespace, namespaces, peersByNS)
					if err != nil {
						warn(fmt.Sprintf("policy %s/%s ingress[%d]: %v", pol.Namespace, pol.Name, n, err))
						continue
					}
					result.Rules = append(result.Rules, built...)
				}
			}

			if isolateOut {
				result.Isolated[Isolation{PodKey: podKey, Direction: DirEgress}] = struct{}{}
				for n, egressRule := range pol.Spec.Egress {
					built, err := buildEgressRules(local, egressRule, pol.Namespace, namespaces, peersByNS)
					if err != nil {
						warn(fmt.Sprintf("policy %s/%s egress[%d]: %v", pol.Namespace, pol.Name, n, err))
						continue
					}
					result.Rules = append(result.Rules, built...)
				}
			}
		}
	}

	return result, nil
}
// policyDirections tells which traffic directions a NetworkPolicy
// isolates.
//
// A non-empty Spec.PolicyTypes is authoritative: each listed type enables
// its direction and unrecognised values are ignored. When the field is
// empty the directions are inferred: ingress is always isolated, egress
// only when Spec.Egress has at least one rule.
func policyDirections(p *netv1.NetworkPolicy) (ingress, egress bool) {
	declared := p.Spec.PolicyTypes
	if len(declared) == 0 {
		return true, len(p.Spec.Egress) > 0
	}
	for _, kind := range declared {
		if kind == netv1.PolicyTypeIngress {
			ingress = true
		} else if kind == netv1.PolicyTypeEgress {
			egress = true
		}
	}
	return ingress, egress
}
// buildIngressRules translates a single NetworkPolicyIngressRule for pod
// into ingress Rule values. The rule's Ports are translated first, then
// its From peers; the first failure is returned as-is. Every emitted Rule
// carries the complete port filter of the source rule.
func buildIngressRules(
	pod Pod,
	r netv1.NetworkPolicyIngressRule,
	policyNS string,
	nsByName map[string]Namespace,
	peerPodsByNS map[string][]PeerPod,
) ([]Rule, error) {
	portMatches, portErr := translatePorts(r.Ports)
	if portErr != nil {
		return nil, portErr
	}

	sources, peerErr := translatePeers(r.From, policyNS, nsByName, peerPodsByNS)
	if peerErr != nil {
		return nil, peerErr
	}

	built := assembleRules(pod, DirIngress, sources, portMatches)
	return built, nil
}
// buildEgressRules translates a single NetworkPolicyEgressRule for pod
// into egress Rule values. It mirrors buildIngressRules: Ports are
// translated first, then the To peers, and the first failure is returned
// as-is.
func buildEgressRules(
	pod Pod,
	r netv1.NetworkPolicyEgressRule,
	policyNS string,
	nsByName map[string]Namespace,
	peerPodsByNS map[string][]PeerPod,
) ([]Rule, error) {
	portMatches, portErr := translatePorts(r.Ports)
	if portErr != nil {
		return nil, portErr
	}

	destinations, peerErr := translatePeers(r.To, policyNS, nsByName, peerPodsByNS)
	if peerErr != nil {
		return nil, peerErr
	}

	built := assembleRules(pod, DirEgress, destinations, portMatches)
	return built, nil
}
// peerSet is the resolved form of one rule's From / To peer list.
type peerSet struct {
	// allowAll marks a rule whose peer list is empty, which the spec
	// reads as "any peer". When it is set, CIDRs and Except are not
	// consulted.
	allowAll bool
	// CIDRs collects every allowed address range of the rule: host
	// CIDRs of pods resolved from selectors, and ipBlock.cidr values.
	CIDRs []*net.IPNet
	// Except lists ipBlock.except ranges to carve out of CIDRs. The
	// list is flat: an entry is not associated with any particular
	// element of CIDRs.
	Except []*net.IPNet
}
// translatePeers resolves a rule's NetworkPolicyPeer list into a peerSet.
//
// An empty list yields allowAll. Otherwise each entry contributes
// addresses: an ipBlock contributes its cidr (minus its except ranges),
// and a podSelector / namespaceSelector entry contributes the host CIDRs
// of the matching peer pods. An entry that sets none of the three, or any
// CIDR string that fails to parse, is reported as an error naming the
// offending peer index.
//
// ipBlock.except only narrows the cidr of the ipBlock it belongs to. The
// peerSet representation is flat (one CIDRs list, one Except list), so
// storing excepts verbatim is exact only when the ipBlock is the sole
// peer of the rule. With several peers a verbatim except would also
// carve addresses out of sibling ipBlocks and out of selector-resolved
// pod IPs. In that case each ipBlock is expanded up front into the CIDRs
// that cover "cidr minus except", and Except is left empty.
func translatePeers(
	peers []netv1.NetworkPolicyPeer,
	policyNS string,
	nsByName map[string]Namespace,
	peerPodsByNS map[string][]PeerPod,
) (peerSet, error) {
	if len(peers) == 0 {
		return peerSet{allowAll: true}, nil
	}
	solePeer := len(peers) == 1
	out := peerSet{}
	for i, p := range peers {
		switch {
		case p.IPBlock != nil:
			_, cidr, err := net.ParseCIDR(p.IPBlock.CIDR)
			if err != nil {
				return peerSet{}, fmt.Errorf("peer[%d] ipBlock.cidr %q: %w", i, p.IPBlock.CIDR, err)
			}
			var excepts []*net.IPNet
			for j, ex := range p.IPBlock.Except {
				_, exNet, err := net.ParseCIDR(ex)
				if err != nil {
					return peerSet{}, fmt.Errorf("peer[%d] ipBlock.except[%d] %q: %w", i, j, ex, err)
				}
				excepts = append(excepts, exNet)
			}
			if solePeer {
				// Only one peer: the flat CIDRs + Except form is exact.
				out.CIDRs = append(out.CIDRs, cidr)
				out.Except = append(out.Except, excepts...)
			} else {
				// Several peers: apply the excepts to this block only.
				out.CIDRs = append(out.CIDRs, subtractCIDRs(cidr, excepts)...)
			}
		case p.PodSelector != nil || p.NamespaceSelector != nil:
			ips, err := resolvePodNamespacePeer(p, policyNS, nsByName, peerPodsByNS)
			if err != nil {
				return peerSet{}, fmt.Errorf("peer[%d]: %w", i, err)
			}
			out.CIDRs = append(out.CIDRs, ips...)
		default:
			return peerSet{}, fmt.Errorf("peer[%d] is empty (must set ipBlock, podSelector, or namespaceSelector)", i)
		}
	}
	return out, nil
}

// subtractCIDRs returns CIDRs that together cover exactly the addresses
// of base not covered by any entry of excepts. With no excepts the result
// is base itself; when the excepts cover all of base the result is empty.
func subtractCIDRs(base *net.IPNet, excepts []*net.IPNet) []*net.IPNet {
	remaining := []*net.IPNet{base}
	for _, ex := range excepts {
		var next []*net.IPNet
		for _, r := range remaining {
			next = append(next, subtractCIDR(r, ex)...)
		}
		remaining = next
	}
	return remaining
}

// subtractCIDR returns the CIDRs covering base minus ex. Networks of
// different address families, or with a non-canonical mask, are treated
// as disjoint and base is returned unchanged.
func subtractCIDR(base, ex *net.IPNet) []*net.IPNet {
	baseOnes, baseBits := base.Mask.Size()
	exOnes, exBits := ex.Mask.Size()
	if baseBits == 0 || baseBits != exBits ||
		len(base.IP)*8 != baseBits || len(ex.IP)*8 != exBits {
		return []*net.IPNet{base}
	}
	if exOnes <= baseOnes {
		// ex is at least as wide as base: it swallows base or misses it.
		if ex.Contains(base.IP) {
			return nil
		}
		return []*net.IPNet{base}
	}
	if !base.Contains(ex.IP) {
		return []*net.IPNet{base}
	}
	// ex lies strictly inside base. Walking from base's prefix length
	// down to ex's, the half that does not contain ex survives at each
	// level; it is ex's address with the bit for that level flipped.
	out := make([]*net.IPNet, 0, exOnes-baseOnes)
	for ones := baseOnes + 1; ones <= exOnes; ones++ {
		mask := net.CIDRMask(ones, baseBits)
		sibling := make(net.IP, len(ex.IP))
		copy(sibling, ex.IP)
		bit := ones - 1
		sibling[bit/8] ^= byte(0x80) >> uint(bit%8)
		out = append(out, &net.IPNet{IP: sibling.Mask(mask), Mask: mask})
	}
	return out
}
// resolvePodNamespacePeer returns one host CIDR (/32 for v4, /128 for v6)
// per IP of every peer pod matched by p's pod and namespace selectors.
//
// Selector semantics from the NetworkPolicy spec:
//
//   - podSelector + namespaceSelector both nil → handled by the caller.
//   - podSelector set, namespaceSelector nil → match pods in the
//     policy's own namespace.
//   - podSelector nil, namespaceSelector set → match every pod in the
//     namespaces that match the namespaceSelector.
//   - both set → AND: the pod must be in a matching namespace and must
//     match the podSelector.
//
// An empty (non-nil) selector matches everything in scope.
//
// Matching namespaces are visited in name order so the result does not
// depend on map iteration order. Pod IPs that are neither 4 nor 16 bytes
// long are skipped rather than turned into an unusable CIDR.
//
// An error is returned only when a selector cannot be converted.
func resolvePodNamespacePeer(
	p netv1.NetworkPolicyPeer,
	policyNS string,
	nsByName map[string]Namespace,
	peerPodsByNS map[string][]PeerPod,
) ([]*net.IPNet, error) {
	var podSel, nsSel labels.Selector
	if p.PodSelector != nil {
		s, err := metav1.LabelSelectorAsSelector(p.PodSelector)
		if err != nil {
			return nil, fmt.Errorf("podSelector: %w", err)
		}
		podSel = s
	}
	if p.NamespaceSelector != nil {
		s, err := metav1.LabelSelectorAsSelector(p.NamespaceSelector)
		if err != nil {
			return nil, fmt.Errorf("namespaceSelector: %w", err)
		}
		nsSel = s
	}
	// Decide which namespaces are in scope.
	var inScope []string
	if nsSel == nil {
		// Pod-only selector → just the policy's own namespace.
		inScope = []string{policyNS}
	} else {
		for name, ns := range nsByName {
			if nsSel.Matches(labels.Set(ns.Labels)) {
				inScope = append(inScope, name)
			}
		}
		// Map iteration order is random; fix the order so identical
		// inputs always produce identical output.
		sort.Strings(inScope)
	}
	var out []*net.IPNet
	for _, ns := range inScope {
		for _, pp := range peerPodsByNS[ns] {
			if podSel != nil && !podSel.Matches(labels.Set(pp.Labels)) {
				continue
			}
			for _, ip := range pp.IPs {
				if ip.To16() == nil {
					// Not a valid IPv4 / IPv6 address.
					continue
				}
				out = append(out, ipToHostCIDR(ip))
			}
		}
	}
	return out, nil
}
// translatePorts converts a rule's NetworkPolicyPort entries into
// PortMatch values.
//
// A nil/empty Ports list means "all ports" by spec; that is represented
// as a single zero-valued PortMatch (any protocol, any port) so the
// renderer can emit one rule instead of a chain of port matches.
//
// For a non-empty list each entry becomes one PortMatch:
//
//   - Protocol must be TCP, UDP or SCTP; when unset it defaults to TCP.
//   - Port, when set, must be numeric and within 1-65535. Named ports are
//     rejected. An unset Port leaves PortMatch.Port at 0, meaning every
//     port of the protocol.
//   - EndPort requires Port and must satisfy Port <= EndPort <= 65535.
//
// The API server normally enforces the range rules, but the translator
// checks them too. A literal port 0 would otherwise be indistinguishable
// from the "any port" sentinel and silently widen the rule, and an
// out-of-range value would produce a rule that cannot be rendered.
//
// The first invalid entry aborts the translation with an error naming its
// index; no partial result is returned.
func translatePorts(ports []netv1.NetworkPolicyPort) ([]PortMatch, error) {
	if len(ports) == 0 {
		return []PortMatch{{}}, nil
	}
	const maxPort = 65535
	out := make([]PortMatch, 0, len(ports))
	for i, p := range ports {
		// The spec default for an unset protocol is TCP, whether or not
		// a port was given.
		protoStr := "tcp"
		if p.Protocol != nil {
			switch *p.Protocol {
			case "TCP":
				protoStr = "tcp"
			case "UDP":
				protoStr = "udp"
			case "SCTP":
				protoStr = "sctp"
			default:
				return nil, fmt.Errorf("port[%d]: protocol %q not supported", i, *p.Protocol)
			}
		}
		var port, endPort int
		if p.Port != nil {
			if p.Port.Type != 0 { // intstr.Int = 0; intstr.String = 1
				return nil, fmt.Errorf("port[%d]: named ports are not yet supported", i)
			}
			port = int(p.Port.IntVal)
			if port < 1 || port > maxPort {
				return nil, fmt.Errorf("port[%d]: port %d out of range 1-%d", i, port, maxPort)
			}
		}
		if p.EndPort != nil {
			if p.Port == nil {
				return nil, fmt.Errorf("port[%d]: endPort requires port", i)
			}
			endPort = int(*p.EndPort)
			if endPort < port {
				return nil, fmt.Errorf("port[%d]: endPort %d < port %d", i, endPort, port)
			}
			if endPort > maxPort {
				return nil, fmt.Errorf("port[%d]: endPort %d out of range 1-%d", i, endPort, maxPort)
			}
		}
		out = append(out, PortMatch{Protocol: protoStr, Port: port, EndPort: endPort})
	}
	return out, nil
}
// assembleRules combines one resolved peer-set with one port list into
// the Rule values for pod in direction dir. At most one Rule is produced:
// the peer-set is shared and the ports are carried inline on it.
//
// A peer-set that is not allowAll and resolved to no CIDRs yields no
// Rule at all: nothing can match it, so the pod simply stays in
// default-deny as far as this rule is concerned. An allowAll peer-set
// yields a Rule without PeerCIDRs / PeerExcept, which stands for "any
// peer". All slices on the Rule are copies of the inputs.
func assembleRules(pod Pod, dir Direction, peers peerSet, ports []PortMatch) []Rule {
	restricted := !peers.allowAll
	if restricted && len(peers.CIDRs) == 0 {
		return nil
	}

	rule := Rule{
		PodKey:    pod.Namespace + "/" + pod.Name,
		HostIface: pod.HostIface,
		PodIPs:    append([]net.IP(nil), pod.IPs...),
		Direction: dir,
		Action:    ActionAccept,
		Ports:     append([]PortMatch(nil), ports...),
	}
	if restricted {
		rule.PeerCIDRs = append([]*net.IPNet(nil), peers.CIDRs...)
		rule.PeerExcept = append([]*net.IPNet(nil), peers.Except...)
	}
	return []Rule{rule}
}
// canonicalisePolicies returns a copy of p ordered by namespace, then by
// name, so that translation does not depend on the order in which the
// policies were handed in. The input slice is left untouched.
func canonicalisePolicies(p []netv1.NetworkPolicy) []netv1.NetworkPolicy {
	sorted := append([]netv1.NetworkPolicy(nil), p...)
	byNamespaceThenName := func(a, b int) bool {
		left, right := &sorted[a], &sorted[b]
		if left.Namespace == right.Namespace {
			return left.Name < right.Name
		}
		return left.Namespace < right.Namespace
	}
	sort.Slice(sorted, byNamespaceThenName)
	return sorted
}
// indexNamespaces builds a lookup table from namespace name to Namespace.
// When a name occurs more than once, the last entry wins.
func indexNamespaces(nss []Namespace) map[string]Namespace {
	byName := make(map[string]Namespace, len(nss))
	for i := range nss {
		byName[nss[i].Name] = nss[i]
	}
	return byName
}
// indexPeerPods groups pods by namespace and orders each group by pod
// name, so that the IP order produced by peer resolution is stable.
//
// A pod that appears more than once under the same (namespace, name) is
// kept only once (the first occurrence wins). Callers may therefore list
// a pod twice — for example once as a cluster-wide peer and once as a
// local pod — without its IPs being emitted twice.
func indexPeerPods(pods []PeerPod) map[string][]PeerPod {
	type podID struct{ namespace, name string }
	seen := make(map[podID]struct{}, len(pods))
	out := map[string][]PeerPod{}
	for _, p := range pods {
		id := podID{namespace: p.Namespace, name: p.Name}
		if _, dup := seen[id]; dup {
			continue
		}
		seen[id] = struct{}{}
		out[p.Namespace] = append(out[p.Namespace], p)
	}
	// Names are unique within a group after deduplication, so the sort
	// has exactly one possible outcome.
	for _, group := range out {
		sort.Slice(group, func(i, j int) bool { return group[i].Name < group[j].Name })
	}
	return out
}
// ipToHostCIDR wraps a single address in the narrowest network that holds
// it: a /32 for anything representable as IPv4 (including IPv4-mapped
// IPv6), otherwise a /128.
func ipToHostCIDR(ip net.IP) *net.IPNet {
	addr, bits := ip.To16(), 128
	if short := ip.To4(); short != nil {
		addr, bits = short, 32
	}
	return &net.IPNet{IP: addr, Mask: net.CIDRMask(bits, bits)}
}