e9d3eef2cc
Build flock Image / build (push) Has been cancelled
K8s NetworkPolicy applies to the start of new connections; reply packets for established flows (and ICMP related) must not be matched against the explicit allow set. The pod ingress chain previously had only explicit dport allows + a final drop, so any reply to a pod-initiated outbound where the reply's dport (the ephemeral source port) wasn't in the allow set got dropped. Hit in production 2026-04-26: garage's `garage-admin-restrict` NP allowed dports 3900/80/3901/3903 only. Garage uses kubernetes_discovery to find peers — outbound to kube-apiserver succeeded, replies returned to ephemeral source ports, dropped → "Layout not ready" cluster-wide. Fix: emit `ct state established,related accept` as the first rule in every pod_<hash>_(ingress|egress) chain. Regression test added. Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
316 lines
9.6 KiB
Go
316 lines
9.6 KiB
Go
package netpol
|
|
|
|
import (
|
|
"fmt"
|
|
"hash/fnv"
|
|
"net"
|
|
"sort"
|
|
"strings"
|
|
)
|
|
|
|
// Render produces an nftables script that, when applied with `nft -f -`,
|
|
// installs the desired NetworkPolicy enforcement state for this node.
|
|
//
|
|
// Layout:
|
|
//
|
|
// table inet flock_netpol {
|
|
// chain forward { # base chain on hook forward
|
|
// type filter hook forward priority filter; policy accept;
|
|
// # one jump per (pod, direction) that has rules and/or isolation
|
|
// iifname "flock1a2b3c4d" ip6 saddr 2001:db8::1 jump pod_<hash>_egress
|
|
// oifname "flock1a2b3c4d" ip6 daddr 2001:db8::1 jump pod_<hash>_ingress
|
|
// }
|
|
// chain pod_<hash>_ingress { # one per isolated direction
|
|
// # explicit allow lines (empty for default-deny)
|
|
// drop
|
|
// }
|
|
// chain pod_<hash>_egress { ... }
|
|
// }
|
|
//
|
|
// The whole table is replaced atomically: a "delete table … 2>/dev/null"
|
|
// (best-effort) followed by an "add table" + the chains. nft executes the
|
|
// script as a single transaction; partial application is impossible.
|
|
//
|
|
// Output is deterministic: equal Output → byte-identical script. The
|
|
// reconciler relies on this for de-dup.
|
|
func Render(out Output) string {
|
|
var sb strings.Builder
|
|
|
|
sb.WriteString("# Generated by flock-agent netpol; do not edit by hand.\n")
|
|
// Best-effort delete; if the table doesn't exist (first run) nft
|
|
// returns an error, hence the redirect. The "add table" then
|
|
// recreates everything.
|
|
sb.WriteString("destroy table inet flock_netpol\n")
|
|
sb.WriteString("table inet flock_netpol {\n")
|
|
|
|
// Build per-(pod, direction) chains. We need them defined BEFORE the
|
|
// base chain references them, so we render chains first.
|
|
chains := buildChains(out)
|
|
for _, c := range chains {
|
|
writeChain(&sb, c)
|
|
}
|
|
|
|
// Base chain emits jumps in a stable order (chain name asc).
|
|
sb.WriteString("\tchain forward {\n")
|
|
sb.WriteString("\t\ttype filter hook forward priority filter; policy accept;\n")
|
|
for _, c := range chains {
|
|
writeBaseJump(&sb, c)
|
|
}
|
|
sb.WriteString("\t}\n")
|
|
|
|
sb.WriteString("}\n")
|
|
return sb.String()
|
|
}
|
|
|
|
// chain is one rendered chain — one direction of one pod.
|
|
type chain struct {
|
|
name string // pod_<hash>_ingress / _egress
|
|
hostIface string
|
|
podIPs []net.IP
|
|
direction Direction
|
|
rules []Rule
|
|
policy string // "drop" or "accept"
|
|
}
|
|
|
|
// buildChains groups rules by (PodKey, Direction) and adds default-deny
|
|
// chains for isolated directions that received no explicit rules.
|
|
func buildChains(out Output) []chain {
|
|
type key struct {
|
|
podKey string
|
|
dir Direction
|
|
}
|
|
byKey := map[key]*chain{}
|
|
|
|
// Seed isolated directions with empty chains so default-deny lands
|
|
// even when no explicit allow rule was emitted for them.
|
|
for iso := range out.Isolated {
|
|
byKey[key{podKey: iso.PodKey, dir: iso.Direction}] = &chain{
|
|
direction: iso.Direction,
|
|
policy: "drop",
|
|
}
|
|
}
|
|
|
|
// Append rules into their chain. Rule.PodIPs and HostIface are
|
|
// authoritative — every rule for a given pod carries the same values
|
|
// (translator invariant), so we copy from the first.
|
|
for _, r := range out.Rules {
|
|
k := key{podKey: r.PodKey, dir: r.Direction}
|
|
c := byKey[k]
|
|
if c == nil {
|
|
// Rule for a non-isolated direction shouldn't happen in
|
|
// practice (translator only emits rules for selected pods)
|
|
// but be tolerant — the chain just gets policy accept.
|
|
c = &chain{direction: r.Direction, policy: "accept"}
|
|
byKey[k] = c
|
|
}
|
|
c.rules = append(c.rules, r)
|
|
if c.hostIface == "" {
|
|
c.hostIface = r.HostIface
|
|
c.podIPs = append([]net.IP(nil), r.PodIPs...)
|
|
}
|
|
}
|
|
|
|
// If a chain was created from Isolated only (no rules), look up the
|
|
// pod's HostIface + IPs from Output.Pods. This is the path a
|
|
// default-deny policy takes — no allow rules, only isolation.
|
|
for k, c := range byKey {
|
|
if c.hostIface != "" {
|
|
continue
|
|
}
|
|
if lp, ok := out.Pods[k.podKey]; ok {
|
|
c.hostIface = lp.HostIface
|
|
c.podIPs = append([]net.IP(nil), lp.IPs...)
|
|
continue
|
|
}
|
|
// Last resort: lift from any rule sharing the PodKey. Should
|
|
// not normally happen — the translator populates Pods for every
|
|
// isolated pod — but defends against partially-populated Output
|
|
// values constructed by tests.
|
|
for _, r := range out.Rules {
|
|
if r.PodKey == k.podKey {
|
|
c.hostIface = r.HostIface
|
|
c.podIPs = append([]net.IP(nil), r.PodIPs...)
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
// Materialise chain names and emit in deterministic order.
|
|
var chains []chain
|
|
for k, c := range byKey {
|
|
if c.hostIface == "" {
|
|
continue // can't jump to it; skip
|
|
}
|
|
c.name = chainName(k.podKey, c.direction)
|
|
chains = append(chains, *c)
|
|
}
|
|
sort.Slice(chains, func(i, j int) bool { return chains[i].name < chains[j].name })
|
|
return chains
|
|
}
|
|
|
|
// chainName produces a stable, name-safe chain identifier. Pod keys can
|
|
// contain characters nft doesn't allow in identifiers, so we hash them.
|
|
// Direction keeps ingress and egress separate.
|
|
func chainName(podKey string, dir Direction) string {
|
|
h := fnv.New64a()
|
|
_, _ = h.Write([]byte(podKey))
|
|
return fmt.Sprintf("pod_%016x_%s", h.Sum64(), dir)
|
|
}
|
|
|
|
// writeChain emits the chain definition. Empty chains exist deliberately:
|
|
// the chain's drop policy IS the default-deny.
|
|
func writeChain(sb *strings.Builder, c chain) {
|
|
fmt.Fprintf(sb, "\tchain %s {\n", c.name)
|
|
// Stateful accept for return traffic. NetworkPolicy applies to the
|
|
// start of a new connection — reply packets for pod-initiated flows
|
|
// (egress) and follow-up packets of an established ingress flow must
|
|
// pass regardless of the explicit allow set, otherwise the chain's
|
|
// final drop kills ephemeral-port replies (e.g. pod → kube-apiserver).
|
|
sb.WriteString("\t\tct state established,related accept\n")
|
|
for _, r := range c.rules {
|
|
writeAllowRule(sb, r)
|
|
}
|
|
if c.policy == "drop" {
|
|
sb.WriteString("\t\tdrop\n")
|
|
}
|
|
sb.WriteString("\t}\n")
|
|
}
|
|
|
|
// writeAllowRule emits one accept line:
|
|
//
|
|
// [ip|ip6 saddr {peers}] [ip|ip6 saddr != {except}] [proto dport {port|port-end}] accept
|
|
//
|
|
// The saddr / daddr field flips based on direction (ingress = from peer →
|
|
// match saddr; egress = to peer → match daddr).
|
|
func writeAllowRule(sb *strings.Builder, r Rule) {
|
|
v6Peers, v4Peers := splitFamily(r.PeerCIDRs)
|
|
v6Except, v4Except := splitFamily(r.PeerExcept)
|
|
v6Pod, v4Pod := splitIPFamily(r.PodIPs)
|
|
hasPeerFilter := len(r.PeerCIDRs) > 0
|
|
|
|
emit := func(family string, peers, except []*net.IPNet, podIP net.IP) {
|
|
if hasPeerFilter && len(peers) == 0 && len(except) == 0 {
|
|
// Peer filter exists but no entries of this family — rule
|
|
// must not match anything for this family.
|
|
return
|
|
}
|
|
if podIP == nil {
|
|
// Pod has no address of this family; nothing to guard.
|
|
return
|
|
}
|
|
for _, port := range r.Ports {
|
|
sb.WriteString("\t\t")
|
|
// Peer (saddr/daddr) match: address is "peer's address",
|
|
// which is saddr on ingress and daddr on egress.
|
|
peerField := peerAddrField(family, r.Direction)
|
|
if hasPeerFilter && len(peers) > 0 {
|
|
fmt.Fprintf(sb, "%s { %s } ", peerField, joinCIDRs(peers))
|
|
}
|
|
if hasPeerFilter && len(except) > 0 {
|
|
fmt.Fprintf(sb, "%s != { %s } ", peerField, joinCIDRs(except))
|
|
}
|
|
// Port match.
|
|
writePortMatch(sb, port)
|
|
fmt.Fprintf(sb, "%s\n", r.Action)
|
|
}
|
|
}
|
|
emit("ip6", v6Peers, v6Except, v6Pod)
|
|
emit("ip", v4Peers, v4Except, v4Pod)
|
|
}
|
|
|
|
// peerAddrField returns "ip6 saddr" / "ip saddr" / "ip6 daddr" / "ip daddr"
|
|
// depending on family + direction. Ingress matches the peer as the source;
|
|
// egress matches the peer as the destination.
|
|
func peerAddrField(family string, dir Direction) string {
|
|
switch {
|
|
case dir == DirIngress:
|
|
return family + " saddr"
|
|
default:
|
|
return family + " daddr"
|
|
}
|
|
}
|
|
|
|
// writePortMatch appends "tcp dport 80 " (single port) or
|
|
// "tcp dport 8000-8999 " (range), or nothing when port is "any".
|
|
func writePortMatch(sb *strings.Builder, p PortMatch) {
|
|
if p.Port == 0 && p.Protocol == "" {
|
|
return
|
|
}
|
|
proto := p.Protocol
|
|
if proto == "" {
|
|
proto = "tcp"
|
|
}
|
|
if p.Port == 0 {
|
|
// Protocol-only match. nft has `meta l4proto tcp`.
|
|
fmt.Fprintf(sb, "meta l4proto %s ", proto)
|
|
return
|
|
}
|
|
if p.EndPort > p.Port {
|
|
fmt.Fprintf(sb, "%s dport %d-%d ", proto, p.Port, p.EndPort)
|
|
return
|
|
}
|
|
fmt.Fprintf(sb, "%s dport %d ", proto, p.Port)
|
|
}
|
|
|
|
// writeBaseJump emits one line per (pod, direction) chain in the base
|
|
// `forward` chain. The match is anchored on the host-side veth name —
|
|
// the veth uniquely belongs to one pod, so anything traversing it is
|
|
// to/from that pod by definition.
|
|
//
|
|
// We deliberately don't filter on the pod's eth0 address: the pod can
|
|
// also receive traffic addressed to its anycast IP (or any other host
|
|
// route the operator has installed via flock-agent), and policy must
|
|
// apply uniformly to all of it.
|
|
func writeBaseJump(sb *strings.Builder, c chain) {
|
|
var iface string
|
|
if c.direction == DirEgress {
|
|
iface = "iifname"
|
|
} else {
|
|
iface = "oifname"
|
|
}
|
|
fmt.Fprintf(sb, "\t\t%s \"%s\" jump %s\n", iface, c.hostIface, c.name)
|
|
}
|
|
|
|
// splitFamily sorts the given CIDRs into two lists by address family and
// returns them as (IPv6, IPv4). A CIDR counts as IPv4 when its base address
// has a 4-byte form (IP.To4 is non-nil); everything else is treated as IPv6.
// The relative order of the input is kept within each list, and a family
// with no entries comes back as a nil slice.
func splitFamily(cs []*net.IPNet) ([]*net.IPNet, []*net.IPNet) {
	var (
		six  []*net.IPNet
		four []*net.IPNet
	)
	for _, cidr := range cs {
		bucket := &six
		if cidr.IP.To4() != nil {
			bucket = &four
		}
		*bucket = append(*bucket, cidr)
	}
	return six, four
}
|
|
|
|
// splitIPFamily selects the first IPv6 and the first IPv4 address found in
// ips (the original author notes that a pod has at most one of each in
// flock's model). Nil entries are skipped, later addresses of a family
// already seen are ignored, and a family that does not occur is returned as
// nil. An address is IPv4 when IP.To4 is non-nil; anything else non-nil is
// taken as IPv6.
func splitIPFamily(ips []net.IP) (v6, v4 net.IP) {
	for _, candidate := range ips {
		switch {
		case candidate == nil:
			// Placeholder entry; nothing to classify.
		case candidate.To4() != nil:
			if v4 == nil {
				v4 = candidate
			}
		case v6 == nil:
			v6 = candidate
		}
	}
	return v6, v4
}
|
|
|
|
// joinCIDRs renders the given networks as the element list of an nft set:
// each CIDR in its textual form, sorted as plain strings (lexicographically,
// not numerically) and joined with ", ". Sorting makes the output independent
// of the input order; the input slice itself is left untouched. An empty
// input yields the empty string.
func joinCIDRs(cs []*net.IPNet) string {
	rendered := make([]string, 0, len(cs))
	for _, cidr := range cs {
		rendered = append(rendered, cidr.String())
	}
	sort.Strings(rendered)
	return strings.Join(rendered, ", ")
}
|