39ede9130b
Build flock Image / build (push) Has been cancelled
New pkg/agent/netpol implementing standard networking.k8s.io/v1 NetworkPolicy. Pipeline: pods + policies + namespaces → Translate → Render → Apply Supports ingress + egress, all three peer types (podSelector, namespaceSelector, ipBlock with except), numeric ports + port ranges, default-deny semantics derived from PolicyTypes (or inferred from non-empty Spec.Egress when unset). Apply path is `nft -f -` shell-out — single transaction, atomic, kernel guarantees partial-failure rollback. Idempotent dedup via last-applied script. Reconcile triggers: informer events, 30s self-heal tick, every CNI ADD/DEL. Verified against the three live cluster NetPols (calico-apiserver, remote-proxies/lodge-home-assistant, storage/garage-admin-restrict). Fuzz target stitches Translate + Render with random selector and peer inputs; 21 unit tests cover the policy semantics. Named ports skip with a warn — deferred until kubelet exposes them in a form that doesn't require shadowing pod state. Dockerfile: + nftables. Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
444 lines
14 KiB
Go
444 lines
14 KiB
Go
package netpol
|
||
|
||
import (
|
||
"fmt"
|
||
"net"
|
||
"sort"
|
||
|
||
netv1 "k8s.io/api/networking/v1"
|
||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||
"k8s.io/apimachinery/pkg/labels"
|
||
)
|
||
|
||
// Inputs is the world-view the translator consumes. All fields are owned
|
||
// by the caller; the translator does not mutate them.
|
||
type Inputs struct {
|
||
// LocalPods are the pods scheduled on this node that have a committed
|
||
// flock allocation. Only these pods get rules — peers may live
|
||
// elsewhere.
|
||
LocalPods []Pod
|
||
|
||
// PeerPods is the cluster-wide pod set used to resolve podSelector +
|
||
// namespaceSelector peers. It is fine to include the local pods here
|
||
// too; duplicates are deduped by (namespace, name).
|
||
PeerPods []PeerPod
|
||
|
||
// Namespaces is the cluster's full Namespace set. Used for
|
||
// namespaceSelector matching.
|
||
Namespaces []Namespace
|
||
|
||
// Policies is every NetworkPolicy in the cluster. The translator
|
||
// filters down to those that select at least one local pod.
|
||
Policies []netv1.NetworkPolicy
|
||
}
|
||
|
||
// Output is the result of one translation pass.
|
||
type Output struct {
|
||
// Rules is the flat ordered list of allow rules to render. The
|
||
// renderer groups them by (PodKey, Direction) into chains.
|
||
Rules []Rule
|
||
|
||
// Isolated is the set of (PodKey, Direction) pairs whose chain must
|
||
// have a default-deny policy. A pod selected by at least one policy
|
||
// in a given direction shows up here. The renderer uses this to
|
||
// decide whether to emit a chain at all and what its base policy is.
|
||
Isolated map[Isolation]struct{}
|
||
|
||
// Pods carries the HostIface + IPs for every local pod referenced
|
||
// by the policy world, including pods that produced only isolation
|
||
// (default-deny) without any allow rules. The renderer needs this
|
||
// because such a pod has no Rule to lift the HostIface from.
|
||
Pods map[string]LocalPod // key = namespace/name
|
||
}
|
||
|
||
// Isolation is the (PodKey, Direction) key of the Isolated map.
|
||
type Isolation struct {
|
||
PodKey string
|
||
Direction Direction
|
||
}
|
||
|
||
// Translate runs the translation pass. It is a pure function: same Inputs
|
||
// always produces semantically equal Output. (Order of slices is stable
|
||
// but Rules within a chain follow the order in which selecting policies
|
||
// appear, which is itself sorted; see canonicalisePolicies.)
|
||
//
|
||
// Errors are returned only for unrecoverable malformed input; per-rule
|
||
// translation errors are logged via warn and skipped so that a single
|
||
// broken policy can't take down enforcement for a whole node. The optional
|
||
// warn callback is invoked for each skipped sub-rule with a human-readable
|
||
// message. Pass nil to silently drop.
|
||
func Translate(in Inputs, warn func(string)) (Output, error) {
|
||
if warn == nil {
|
||
warn = func(string) {}
|
||
}
|
||
|
||
out := Output{
|
||
Isolated: map[Isolation]struct{}{},
|
||
Pods: map[string]LocalPod{},
|
||
}
|
||
policies := canonicalisePolicies(in.Policies)
|
||
nsByName := indexNamespaces(in.Namespaces)
|
||
peerPodsByNS := indexPeerPods(in.PeerPods)
|
||
|
||
for _, pod := range in.LocalPods {
|
||
if len(pod.IPs) == 0 {
|
||
continue // no allocation yet; translator skips
|
||
}
|
||
key := pod.Namespace + "/" + pod.Name
|
||
|
||
// Find every policy in pod.Namespace whose podSelector matches.
|
||
// Cross-namespace policies do not select pods outside their own
|
||
// namespace; that's how the NetworkPolicy spec defines it.
|
||
for _, p := range policies {
|
||
if p.Namespace != pod.Namespace {
|
||
continue
|
||
}
|
||
sel, err := metav1.LabelSelectorAsSelector(&p.Spec.PodSelector)
|
||
if err != nil {
|
||
warn(fmt.Sprintf("policy %s/%s: invalid podSelector: %v", p.Namespace, p.Name, err))
|
||
continue
|
||
}
|
||
if !sel.Matches(labels.Set(pod.Labels)) {
|
||
continue
|
||
}
|
||
|
||
ingress, egress := policyDirections(&p)
|
||
if ingress || egress {
|
||
out.Pods[key] = LocalPod{
|
||
PodKey: key,
|
||
HostIface: pod.HostIface,
|
||
IPs: append([]net.IP(nil), pod.IPs...),
|
||
}
|
||
}
|
||
if ingress {
|
||
out.Isolated[Isolation{PodKey: key, Direction: DirIngress}] = struct{}{}
|
||
}
|
||
if egress {
|
||
out.Isolated[Isolation{PodKey: key, Direction: DirEgress}] = struct{}{}
|
||
}
|
||
|
||
// Translate ingress rules.
|
||
if ingress {
|
||
for ri, r := range p.Spec.Ingress {
|
||
rules, err := buildIngressRules(pod, r, p.Namespace, nsByName, peerPodsByNS)
|
||
if err != nil {
|
||
warn(fmt.Sprintf("policy %s/%s ingress[%d]: %v", p.Namespace, p.Name, ri, err))
|
||
continue
|
||
}
|
||
out.Rules = append(out.Rules, rules...)
|
||
}
|
||
}
|
||
// Translate egress rules.
|
||
if egress {
|
||
for ri, r := range p.Spec.Egress {
|
||
rules, err := buildEgressRules(pod, r, p.Namespace, nsByName, peerPodsByNS)
|
||
if err != nil {
|
||
warn(fmt.Sprintf("policy %s/%s egress[%d]: %v", p.Namespace, p.Name, ri, err))
|
||
continue
|
||
}
|
||
out.Rules = append(out.Rules, rules...)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
return out, nil
|
||
}
|
||
|
||
// policyDirections reports which directions a NetworkPolicy isolates.
|
||
//
|
||
// Per the spec, the PolicyTypes field is the source of truth when set;
|
||
// when omitted, isolation is inferred from which rule lists are populated
|
||
// (Ingress always; Egress only if Spec.Egress is non-empty).
|
||
func policyDirections(p *netv1.NetworkPolicy) (ingress, egress bool) {
|
||
if len(p.Spec.PolicyTypes) > 0 {
|
||
for _, t := range p.Spec.PolicyTypes {
|
||
switch t {
|
||
case netv1.PolicyTypeIngress:
|
||
ingress = true
|
||
case netv1.PolicyTypeEgress:
|
||
egress = true
|
||
}
|
||
}
|
||
return
|
||
}
|
||
ingress = true
|
||
egress = len(p.Spec.Egress) > 0
|
||
return
|
||
}
|
||
|
||
// buildIngressRules expands one NetworkPolicyIngressRule into Rule(s).
|
||
// One Rule per allowed peer-set; each Rule carries the full Ports filter
|
||
// from the source rule.
|
||
func buildIngressRules(
|
||
pod Pod,
|
||
r netv1.NetworkPolicyIngressRule,
|
||
policyNS string,
|
||
nsByName map[string]Namespace,
|
||
peerPodsByNS map[string][]PeerPod,
|
||
) ([]Rule, error) {
|
||
ports, err := translatePorts(r.Ports)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
peers, err := translatePeers(r.From, policyNS, nsByName, peerPodsByNS)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
return assembleRules(pod, DirIngress, peers, ports), nil
|
||
}
|
||
|
||
// buildEgressRules is the egress mirror of buildIngressRules.
|
||
func buildEgressRules(
|
||
pod Pod,
|
||
r netv1.NetworkPolicyEgressRule,
|
||
policyNS string,
|
||
nsByName map[string]Namespace,
|
||
peerPodsByNS map[string][]PeerPod,
|
||
) ([]Rule, error) {
|
||
ports, err := translatePorts(r.Ports)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
peers, err := translatePeers(r.To, policyNS, nsByName, peerPodsByNS)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
return assembleRules(pod, DirEgress, peers, ports), nil
|
||
}
|
||
|
||
// peerSet is the resolved peer information for one rule's From / To list.
|
||
type peerSet struct {
|
||
// allowAll is true when the rule has no peers at all (an empty From /
|
||
// To list, which the spec defines as "from anywhere"). It overrides
|
||
// CIDRs and Except.
|
||
allowAll bool
|
||
// CIDRs is the union of every IP / CIDR contributed by the rule's
|
||
// peer entries (resolved Pod IPs, namespace pods, and ipBlock.cidr).
|
||
CIDRs []*net.IPNet
|
||
// Except is the union of every ipBlock.except entry across the rule.
|
||
Except []*net.IPNet
|
||
}
|
||
|
||
// translatePeers resolves a list of NetworkPolicyPeer entries into a
|
||
// peerSet. Each peer entry contributes either CIDRs (resolved from
|
||
// pod / namespace selectors, or copied from ipBlock) or Except entries.
|
||
func translatePeers(
|
||
peers []netv1.NetworkPolicyPeer,
|
||
policyNS string,
|
||
nsByName map[string]Namespace,
|
||
peerPodsByNS map[string][]PeerPod,
|
||
) (peerSet, error) {
|
||
if len(peers) == 0 {
|
||
return peerSet{allowAll: true}, nil
|
||
}
|
||
out := peerSet{}
|
||
for i, p := range peers {
|
||
switch {
|
||
case p.IPBlock != nil:
|
||
_, cidr, err := net.ParseCIDR(p.IPBlock.CIDR)
|
||
if err != nil {
|
||
return peerSet{}, fmt.Errorf("peer[%d] ipBlock.cidr %q: %w", i, p.IPBlock.CIDR, err)
|
||
}
|
||
out.CIDRs = append(out.CIDRs, cidr)
|
||
for j, ex := range p.IPBlock.Except {
|
||
_, exNet, err := net.ParseCIDR(ex)
|
||
if err != nil {
|
||
return peerSet{}, fmt.Errorf("peer[%d] ipBlock.except[%d] %q: %w", i, j, ex, err)
|
||
}
|
||
out.Except = append(out.Except, exNet)
|
||
}
|
||
case p.PodSelector != nil || p.NamespaceSelector != nil:
|
||
ips, err := resolvePodNamespacePeer(p, policyNS, nsByName, peerPodsByNS)
|
||
if err != nil {
|
||
return peerSet{}, fmt.Errorf("peer[%d]: %w", i, err)
|
||
}
|
||
out.CIDRs = append(out.CIDRs, ips...)
|
||
default:
|
||
return peerSet{}, fmt.Errorf("peer[%d] is empty (must set ipBlock, podSelector, or namespaceSelector)", i)
|
||
}
|
||
}
|
||
return out, nil
|
||
}
|
||
|
||
// resolvePodNamespacePeer walks the cluster's peer-pod set and returns
|
||
// /128 (v6) and /32 (v4) CIDRs for each pod that matches the (possibly
|
||
// combined) pod + namespace selectors.
|
||
//
|
||
// Selector semantics from the NetworkPolicy spec:
|
||
//
|
||
// - podSelector + namespaceSelector both nil → handled upstream.
|
||
// - podSelector set, namespaceSelector nil → match in the policy's
|
||
// own namespace.
|
||
// - podSelector nil, namespaceSelector set → match every pod in
|
||
// namespaces that match the namespaceSelector.
|
||
// - both set → AND: pod must be in a matching namespace AND match
|
||
// the podSelector.
|
||
//
|
||
// An empty (non-nil) selector matches everything in scope.
|
||
func resolvePodNamespacePeer(
|
||
p netv1.NetworkPolicyPeer,
|
||
policyNS string,
|
||
nsByName map[string]Namespace,
|
||
peerPodsByNS map[string][]PeerPod,
|
||
) ([]*net.IPNet, error) {
|
||
var podSel, nsSel labels.Selector
|
||
if p.PodSelector != nil {
|
||
s, err := metav1.LabelSelectorAsSelector(p.PodSelector)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("podSelector: %w", err)
|
||
}
|
||
podSel = s
|
||
}
|
||
if p.NamespaceSelector != nil {
|
||
s, err := metav1.LabelSelectorAsSelector(p.NamespaceSelector)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("namespaceSelector: %w", err)
|
||
}
|
||
nsSel = s
|
||
}
|
||
|
||
// Decide which namespaces are in scope.
|
||
var inScope []string
|
||
if nsSel == nil {
|
||
// Pod-only selector → just the policy's own namespace.
|
||
inScope = []string{policyNS}
|
||
} else {
|
||
for name, ns := range nsByName {
|
||
if nsSel.Matches(labels.Set(ns.Labels)) {
|
||
inScope = append(inScope, name)
|
||
}
|
||
}
|
||
}
|
||
|
||
var out []*net.IPNet
|
||
for _, ns := range inScope {
|
||
for _, pp := range peerPodsByNS[ns] {
|
||
if podSel != nil && !podSel.Matches(labels.Set(pp.Labels)) {
|
||
continue
|
||
}
|
||
for _, ip := range pp.IPs {
|
||
out = append(out, ipToHostCIDR(ip))
|
||
}
|
||
}
|
||
}
|
||
return out, nil
|
||
}
|
||
|
||
// translatePorts converts NetworkPolicyPort entries into PortMatch.
|
||
//
|
||
// A nil/empty Ports list on a NetworkPolicy rule means "all ports" by
|
||
// spec; we represent that as a single zero-valued PortMatch (any proto,
|
||
// any port) so the renderer can emit a single rule rather than a chain
|
||
// of port-equality matches.
|
||
func translatePorts(ports []netv1.NetworkPolicyPort) ([]PortMatch, error) {
|
||
if len(ports) == 0 {
|
||
return []PortMatch{{}}, nil
|
||
}
|
||
var out []PortMatch
|
||
for i, p := range ports {
|
||
var protoStr string
|
||
if p.Protocol != nil {
|
||
switch *p.Protocol {
|
||
case "TCP":
|
||
protoStr = "tcp"
|
||
case "UDP":
|
||
protoStr = "udp"
|
||
case "SCTP":
|
||
protoStr = "sctp"
|
||
default:
|
||
return nil, fmt.Errorf("port[%d]: protocol %q not supported", i, *p.Protocol)
|
||
}
|
||
} else {
|
||
// Spec default: TCP. We use empty string to mean "any of
|
||
// the three" only when the user explicitly sets neither
|
||
// protocol nor port; here the user has supplied a Port,
|
||
// which implies a protocol — and the spec default is TCP.
|
||
protoStr = "tcp"
|
||
}
|
||
var port, endPort int
|
||
if p.Port != nil {
|
||
if p.Port.Type != 0 { // intstr.Int = 0; intstr.String = 1
|
||
return nil, fmt.Errorf("port[%d]: named ports are not yet supported", i)
|
||
}
|
||
port = int(p.Port.IntVal)
|
||
}
|
||
if p.EndPort != nil {
|
||
endPort = int(*p.EndPort)
|
||
if endPort < port {
|
||
return nil, fmt.Errorf("port[%d]: endPort %d < port %d", i, endPort, port)
|
||
}
|
||
}
|
||
out = append(out, PortMatch{Protocol: protoStr, Port: port, EndPort: endPort})
|
||
}
|
||
return out, nil
|
||
}
|
||
|
||
// assembleRules emits the cross-product of (one peer-set) × (port list).
|
||
// We currently emit a single Rule per direction since the peer-set is the
|
||
// expensive shared field; ports go inline. allowAll peers result in a
|
||
// rule with no PeerCIDRs, which the renderer treats as "any source".
|
||
func assembleRules(pod Pod, dir Direction, peers peerSet, ports []PortMatch) []Rule {
|
||
if !peers.allowAll && len(peers.CIDRs) == 0 {
|
||
// Selector matched no peers (e.g. podSelector for a label that
|
||
// no live pod has). Emit nothing — the rule cannot allow any
|
||
// real traffic. The pod stays in default-deny for this rule.
|
||
return nil
|
||
}
|
||
r := Rule{
|
||
PodKey: pod.Namespace + "/" + pod.Name,
|
||
HostIface: pod.HostIface,
|
||
PodIPs: append([]net.IP(nil), pod.IPs...),
|
||
Direction: dir,
|
||
Action: ActionAccept,
|
||
Ports: append([]PortMatch(nil), ports...),
|
||
}
|
||
if !peers.allowAll {
|
||
r.PeerCIDRs = append([]*net.IPNet(nil), peers.CIDRs...)
|
||
r.PeerExcept = append([]*net.IPNet(nil), peers.Except...)
|
||
}
|
||
return []Rule{r}
|
||
}
|
||
|
||
// canonicalisePolicies sorts the policy slice by (namespace, name) so the
|
||
// translator's output is deterministic regardless of informer event order.
|
||
func canonicalisePolicies(p []netv1.NetworkPolicy) []netv1.NetworkPolicy {
|
||
out := append([]netv1.NetworkPolicy(nil), p...)
|
||
sort.Slice(out, func(i, j int) bool {
|
||
if out[i].Namespace != out[j].Namespace {
|
||
return out[i].Namespace < out[j].Namespace
|
||
}
|
||
return out[i].Name < out[j].Name
|
||
})
|
||
return out
|
||
}
|
||
|
||
func indexNamespaces(nss []Namespace) map[string]Namespace {
|
||
out := make(map[string]Namespace, len(nss))
|
||
for _, ns := range nss {
|
||
out[ns.Name] = ns
|
||
}
|
||
return out
|
||
}
|
||
|
||
func indexPeerPods(pods []PeerPod) map[string][]PeerPod {
|
||
out := map[string][]PeerPod{}
|
||
for _, p := range pods {
|
||
out[p.Namespace] = append(out[p.Namespace], p)
|
||
}
|
||
// Sort each namespace's pod list by (name) so the translator's IP
|
||
// ordering is stable.
|
||
for k := range out {
|
||
sort.Slice(out[k], func(i, j int) bool { return out[k][i].Name < out[k][j].Name })
|
||
}
|
||
return out
|
||
}
|
||
|
||
// ipToHostCIDR wraps a single address in the narrowest CIDR that contains
// it: ip/32 when ip has an IPv4 form (the stored address is then the 4-byte
// form), ip/128 otherwise.
func ipToHostCIDR(ip net.IP) *net.IPNet {
	addr, bits := ip.To16(), 8*net.IPv6len
	if short := ip.To4(); short != nil {
		addr, bits = short, 8*net.IPv4len
	}
	return &net.IPNet{IP: addr, Mask: net.CIDRMask(bits, bits)}
}
|