package netpol import ( "fmt" "hash/fnv" "net" "sort" "strings" ) // Render produces an nftables script that, when applied with `nft -f -`, // installs the desired NetworkPolicy enforcement state for this node. // // Layout: // // table inet flock_netpol { // chain forward { # base chain on hook forward // type filter hook forward priority filter; policy accept; // # one jump per (pod, direction) that has rules and/or isolation // iifname "flock1a2b3c4d" ip6 saddr 2001:db8::1 jump pod__egress // oifname "flock1a2b3c4d" ip6 daddr 2001:db8::1 jump pod__ingress // } // chain pod__ingress { # one per isolated direction // # explicit allow lines (empty for default-deny) // drop // } // chain pod__egress { ... } // } // // The whole table is replaced atomically: a "delete table … 2>/dev/null" // (best-effort) followed by an "add table" + the chains. nft executes the // script as a single transaction; partial application is impossible. // // Output is deterministic: equal Output → byte-identical script. The // reconciler relies on this for de-dup. func Render(out Output) string { var sb strings.Builder sb.WriteString("# Generated by flock-agent netpol; do not edit by hand.\n") // Best-effort delete; if the table doesn't exist (first run) nft // returns an error, hence the redirect. The "add table" then // recreates everything. sb.WriteString("destroy table inet flock_netpol\n") sb.WriteString("table inet flock_netpol {\n") // Build per-(pod, direction) chains. We need them defined BEFORE the // base chain references them, so we render chains first. chains := buildChains(out) for _, c := range chains { writeChain(&sb, c) } // Base chain emits jumps in a stable order (chain name asc). sb.WriteString("\tchain forward {\n") sb.WriteString("\t\ttype filter hook forward priority filter; policy accept;\n") for _, c := range chains { writeBaseJump(&sb, c) } sb.WriteString("\t}\n") sb.WriteString("}\n") return sb.String() } // chain is one rendered chain — one direction of one pod. type chain struct { name string // pod__ingress / _egress hostIface string podIPs []net.IP direction Direction rules []Rule policy string // "drop" or "accept" } // buildChains groups rules by (PodKey, Direction) and adds default-deny // chains for isolated directions that received no explicit rules. func buildChains(out Output) []chain { type key struct { podKey string dir Direction } byKey := map[key]*chain{} // Seed isolated directions with empty chains so default-deny lands // even when no explicit allow rule was emitted for them. for iso := range out.Isolated { byKey[key{podKey: iso.PodKey, dir: iso.Direction}] = &chain{ direction: iso.Direction, policy: "drop", } } // Append rules into their chain. Rule.PodIPs and HostIface are // authoritative — every rule for a given pod carries the same values // (translator invariant), so we copy from the first. for _, r := range out.Rules { k := key{podKey: r.PodKey, dir: r.Direction} c := byKey[k] if c == nil { // Rule for a non-isolated direction shouldn't happen in // practice (translator only emits rules for selected pods) // but be tolerant — the chain just gets policy accept. c = &chain{direction: r.Direction, policy: "accept"} byKey[k] = c } c.rules = append(c.rules, r) if c.hostIface == "" { c.hostIface = r.HostIface c.podIPs = append([]net.IP(nil), r.PodIPs...) } } // If a chain was created from Isolated only (no rules), look up the // pod's HostIface + IPs from Output.Pods. This is the path a // default-deny policy takes — no allow rules, only isolation. for k, c := range byKey { if c.hostIface != "" { continue } if lp, ok := out.Pods[k.podKey]; ok { c.hostIface = lp.HostIface c.podIPs = append([]net.IP(nil), lp.IPs...) continue } // Last resort: lift from any rule sharing the PodKey. Should // not normally happen — the translator populates Pods for every // isolated pod — but defends against partially-populated Output // values constructed by tests. for _, r := range out.Rules { if r.PodKey == k.podKey { c.hostIface = r.HostIface c.podIPs = append([]net.IP(nil), r.PodIPs...) break } } } // Materialise chain names and emit in deterministic order. var chains []chain for k, c := range byKey { if c.hostIface == "" { continue // can't jump to it; skip } c.name = chainName(k.podKey, c.direction) chains = append(chains, *c) } sort.Slice(chains, func(i, j int) bool { return chains[i].name < chains[j].name }) return chains } // chainName produces a stable, name-safe chain identifier. Pod keys can // contain characters nft doesn't allow in identifiers, so we hash them. // Direction keeps ingress and egress separate. func chainName(podKey string, dir Direction) string { h := fnv.New64a() _, _ = h.Write([]byte(podKey)) return fmt.Sprintf("pod_%016x_%s", h.Sum64(), dir) } // writeChain emits the chain definition. Empty chains exist deliberately: // the chain's drop policy IS the default-deny. func writeChain(sb *strings.Builder, c chain) { fmt.Fprintf(sb, "\tchain %s {\n", c.name) for _, r := range c.rules { writeAllowRule(sb, r) } if c.policy == "drop" { sb.WriteString("\t\tdrop\n") } sb.WriteString("\t}\n") } // writeAllowRule emits one accept line: // // [ip|ip6 saddr {peers}] [ip|ip6 saddr != {except}] [proto dport {port|port-end}] accept // // The saddr / daddr field flips based on direction (ingress = from peer → // match saddr; egress = to peer → match daddr). func writeAllowRule(sb *strings.Builder, r Rule) { v6Peers, v4Peers := splitFamily(r.PeerCIDRs) v6Except, v4Except := splitFamily(r.PeerExcept) v6Pod, v4Pod := splitIPFamily(r.PodIPs) hasPeerFilter := len(r.PeerCIDRs) > 0 emit := func(family string, peers, except []*net.IPNet, podIP net.IP) { if hasPeerFilter && len(peers) == 0 && len(except) == 0 { // Peer filter exists but no entries of this family — rule // must not match anything for this family. return } if podIP == nil { // Pod has no address of this family; nothing to guard. return } for _, port := range r.Ports { sb.WriteString("\t\t") // Peer (saddr/daddr) match: address is "peer's address", // which is saddr on ingress and daddr on egress. peerField := peerAddrField(family, r.Direction) if hasPeerFilter && len(peers) > 0 { fmt.Fprintf(sb, "%s { %s } ", peerField, joinCIDRs(peers)) } if hasPeerFilter && len(except) > 0 { fmt.Fprintf(sb, "%s != { %s } ", peerField, joinCIDRs(except)) } // Port match. writePortMatch(sb, port) fmt.Fprintf(sb, "%s\n", r.Action) } } emit("ip6", v6Peers, v6Except, v6Pod) emit("ip", v4Peers, v4Except, v4Pod) } // peerAddrField returns "ip6 saddr" / "ip saddr" / "ip6 daddr" / "ip daddr" // depending on family + direction. Ingress matches the peer as the source; // egress matches the peer as the destination. func peerAddrField(family string, dir Direction) string { switch { case dir == DirIngress: return family + " saddr" default: return family + " daddr" } } // writePortMatch appends "tcp dport 80 " (single port) or // "tcp dport 8000-8999 " (range), or nothing when port is "any". func writePortMatch(sb *strings.Builder, p PortMatch) { if p.Port == 0 && p.Protocol == "" { return } proto := p.Protocol if proto == "" { proto = "tcp" } if p.Port == 0 { // Protocol-only match. nft has `meta l4proto tcp`. fmt.Fprintf(sb, "meta l4proto %s ", proto) return } if p.EndPort > p.Port { fmt.Fprintf(sb, "%s dport %d-%d ", proto, p.Port, p.EndPort) return } fmt.Fprintf(sb, "%s dport %d ", proto, p.Port) } // writeBaseJump emits one line per (pod, direction) chain in the base // `forward` chain. The match is anchored on the host-side veth name so // the rule only fires for traffic that genuinely crosses this pod's veth. // // We additionally constrain on the pod's address (saddr for egress, daddr // for ingress) so a packet that somehow hits the wrong veth — e.g. during // a CNI ADD race — won't be policy-evaluated against the wrong pod. func writeBaseJump(sb *strings.Builder, c chain) { v6, v4 := splitIPFamily(c.podIPs) emit := func(family string, ip net.IP) { if ip == nil { return } var iface, addrField, addrStr string if c.direction == DirEgress { iface = "iifname" addrField = family + " saddr" } else { iface = "oifname" addrField = family + " daddr" } if family == "ip" { addrStr = ip.To4().String() } else { addrStr = ip.To16().String() } fmt.Fprintf(sb, "\t\t%s \"%s\" %s %s jump %s\n", iface, c.hostIface, addrField, addrStr, c.name) } emit("ip6", v6) emit("ip", v4) } // splitFamily partitions CIDRs into (v6, v4) lists, preserving order // within each family. func splitFamily(cs []*net.IPNet) ([]*net.IPNet, []*net.IPNet) { var v6, v4 []*net.IPNet for _, c := range cs { if c.IP.To4() != nil { v4 = append(v4, c) } else { v6 = append(v6, c) } } return v6, v4 } // splitIPFamily picks one v6 and one v4 from a list of pod IPs (a pod has // at most one of each in flock's model). func splitIPFamily(ips []net.IP) (v6, v4 net.IP) { for _, ip := range ips { if ip == nil { continue } if ip.To4() != nil { if v4 == nil { v4 = ip } } else { if v6 == nil { v6 = ip } } } return } func joinCIDRs(cs []*net.IPNet) string { parts := make([]string, len(cs)) for i, c := range cs { parts[i] = c.String() } sort.Strings(parts) return strings.Join(parts, ", ") }