a6202a36bd
Build flock Image / build (push) Has been cancelled
BuiltinFamilyDefaults() now returns {WantV6: true, WantV4: true}. Pods
that want a single family explicitly opt out via the
flock.fritzlab.net/ipv4 (or ipv6) annotation, or the operator narrows
the default at the node level via NodeConfig.Spec.Defaults.
Annotation precedence is unchanged: pod annotation > NodeConfig defaults
> built-in baseline. Tests updated to reflect the new baseline; the
"opt out of v4" path now has explicit coverage.
Docs updated:
- NodeConfig.Spec.Defaults Go doc + CRD descriptions reflect the new
baseline and its overrides
- README opening framing softened from "IPv6-first" to "dual-stack,
IPv6-friendly"; example pods + spec.defaults table flipped to
treat dual-stack as the default and v6/v4-only as overrides
- README NetworkPolicy line in the comparison table flipped to
"yes (nftables)" since v1 enforcement shipped
- Limitations note about IPv4-only destinations rewritten — every
pod has v4 by default now, so the question is whether your IPv4
pool is routable beyond your network
Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
402 lines
11 KiB
Go
402 lines
11 KiB
Go
package agent
|
|
|
|
import (
|
|
"crypto/rand"
|
|
"fmt"
|
|
"math/big"
|
|
"net"
|
|
"sync"
|
|
|
|
"code.fritzlab.net/fritzlab/flock/pkg/embed"
|
|
)
|
|
|
|
// IPAM allocates per-pod IPv6 and IPv4 addresses from the NodeConfig CIDRs.
|
|
//
|
|
// Invariants:
|
|
// - Concurrent callers serialize through mu.
|
|
// - Allocate() never returns an address already marked in-use. Collisions
|
|
// trigger retry (IPv6: up to 16 regenerations of the random N nibble or
|
|
// random IID; IPv4: linear scan of the effective /24 or smaller).
|
|
// - Release is idempotent.
|
|
//
|
|
// IPAM is constructed per-agent; it holds no durable state. The caller is
|
|
// responsible for marking pre-existing committed allocations in-use via
|
|
// MarkInUse during startup reconciliation.
|
|
type IPAM struct {
|
|
mu sync.Mutex
|
|
cidr6 []*net.IPNet
|
|
cidr4 []*net.IPNet
|
|
used map[string]struct{} // canonical IP strings
|
|
randSrc randSource
|
|
}
|
|
|
|
// randSource abstracts the allocator's entropy so that tests can substitute
// a deterministic implementation.
type randSource interface {
	// NibbleN yields one random 4-bit value.
	NibbleN() byte
	// FillIID overwrites dst with random bytes.
	FillIID(dst []byte)
	// PickIndex yields a pseudo-random index in [0, n). Tests may pin it.
	PickIndex(n int) int
}
|
|
|
|
// cryptoRand is the production randSource, backed by crypto/rand.
type cryptoRand struct{}

// NibbleN returns a random value in [0, 15].
func (cryptoRand) NibbleN() byte {
	var b [1]byte
	// The randSource interface has no error channel, so a read failure is
	// deliberately ignored; b then keeps its zero value.
	_, _ = rand.Read(b[:])
	return b[0] & 0x0F
}

// FillIID overwrites dst with random bytes.
func (cryptoRand) FillIID(dst []byte) {
	// Read failure is deliberately ignored for the same reason as in NibbleN;
	// dst is then left as rand.Read left it.
	_, _ = rand.Read(dst)
}

// PickIndex returns a random index in [0, n). It returns 0 when n <= 1, and
// also when the random source fails, so the result is always a valid index
// for a non-empty slice of length n.
func (cryptoRand) PickIndex(n int) int {
	if n <= 1 {
		return 0
	}
	idx, err := rand.Int(rand.Reader, big.NewInt(int64(n)))
	if err != nil {
		// rand.Int returns a nil *big.Int on error; dereferencing it would
		// panic, so fall back to the first element instead.
		return 0
	}
	return int(idx.Int64())
}
|
|
|
|
// AllocRequest describes a pending allocation. Values come from Pod metadata
|
|
// + annotations at CNI ADD time, with per-node FamilyDefaults already merged
|
|
// in (see ParseAnnotations).
|
|
type AllocRequest struct {
|
|
ContainerID string
|
|
Namespace string
|
|
Pod string
|
|
// WantV6 / WantV4 are the post-merge address family selection (pod
|
|
// annotation > NodeConfig.Spec.Defaults > built-in baseline of
|
|
// dual-stack). At least one MUST be true; Allocate rejects the request
|
|
// otherwise.
|
|
WantV6 bool
|
|
WantV4 bool
|
|
// AnnCIDR6 / AnnCIDR4 come from the cidr6 / cidr4 annotations. Empty
|
|
// means "use any of the node's CIDRs".
|
|
AnnCIDR6 []*net.IPNet
|
|
AnnCIDR4 []*net.IPNet
|
|
// IPAlgo comes from the ip-algo annotation. Empty means random IID.
|
|
IPAlgo []embed.Field
|
|
// ImageDigest is the sha256 manifest digest (with or without "sha256:"
|
|
// prefix). If empty, embed.Values.ImageFallback = ContainerID is used
|
|
// for ip-algo fields that reference image.
|
|
ImageDigest string
|
|
}
|
|
|
|
// AllocResult carries the addresses Allocate chose, for use by the CNI ADD.
type AllocResult struct {
	// IP6 is nil when the request did not ask for IPv6.
	IP6 net.IP
	// IP4 is nil when the request did not ask for IPv4.
	IP4 net.IP
}
|
|
|
|
// NewIPAM constructs an IPAM seeded from NodeConfig CIDRs.
|
|
func NewIPAM(cidr6, cidr4 []string) (*IPAM, error) {
|
|
i := &IPAM{
|
|
used: map[string]struct{}{},
|
|
randSrc: cryptoRand{},
|
|
}
|
|
for _, s := range cidr6 {
|
|
_, n, err := net.ParseCIDR(s)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("cidr6 %q: %w", s, err)
|
|
}
|
|
if n.IP.To4() != nil {
|
|
return nil, fmt.Errorf("cidr6 %q is IPv4", s)
|
|
}
|
|
i.cidr6 = append(i.cidr6, n)
|
|
}
|
|
for _, s := range cidr4 {
|
|
_, n, err := net.ParseCIDR(s)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("cidr4 %q: %w", s, err)
|
|
}
|
|
if n.IP.To4() == nil {
|
|
return nil, fmt.Errorf("cidr4 %q is not IPv4", s)
|
|
}
|
|
i.cidr4 = append(i.cidr4, n)
|
|
}
|
|
return i, nil
|
|
}
|
|
|
|
// MarkInUse records that `ip` is already taken. Used during startup
|
|
// reconciliation to seed the allocator from committed state.
|
|
func (i *IPAM) MarkInUse(ip net.IP) {
|
|
if ip == nil {
|
|
return
|
|
}
|
|
i.mu.Lock()
|
|
i.used[canonical(ip)] = struct{}{}
|
|
i.mu.Unlock()
|
|
}
|
|
|
|
// Release marks one or more IPs as free. Safe to call for IPs not in use.
|
|
func (i *IPAM) Release(ips ...net.IP) {
|
|
i.mu.Lock()
|
|
defer i.mu.Unlock()
|
|
for _, ip := range ips {
|
|
if ip != nil {
|
|
delete(i.used, canonical(ip))
|
|
}
|
|
}
|
|
}
|
|
|
|
// Allocate resolves effective CIDRs from the request + node CIDRs, picks
|
|
// addresses, and records them as in-use. It is atomic with respect to other
|
|
// Allocate / Release / MarkInUse calls.
|
|
func (i *IPAM) Allocate(req AllocRequest) (AllocResult, error) {
|
|
if !req.WantV6 && !req.WantV4 {
|
|
return AllocResult{}, fmt.Errorf("pod %s/%s: at least one of ipv6/ipv4 must be true", req.Namespace, req.Pod)
|
|
}
|
|
|
|
i.mu.Lock()
|
|
defer i.mu.Unlock()
|
|
|
|
var out AllocResult
|
|
var claimed []net.IP
|
|
|
|
rollback := func() {
|
|
for _, ip := range claimed {
|
|
delete(i.used, canonical(ip))
|
|
}
|
|
}
|
|
|
|
if req.WantV6 {
|
|
eff, err := resolveEffective(req.AnnCIDR6, i.cidr6)
|
|
if err != nil {
|
|
return AllocResult{}, fmt.Errorf("pod %s/%s cidr6: %w", req.Namespace, req.Pod, err)
|
|
}
|
|
cidr := eff[i.randSrc.PickIndex(len(eff))]
|
|
ip, err := i.allocV6(cidr, req)
|
|
if err != nil {
|
|
rollback()
|
|
return AllocResult{}, err
|
|
}
|
|
i.used[canonical(ip)] = struct{}{}
|
|
claimed = append(claimed, ip)
|
|
out.IP6 = ip
|
|
}
|
|
|
|
if req.WantV4 {
|
|
eff, err := resolveEffective(req.AnnCIDR4, i.cidr4)
|
|
if err != nil {
|
|
rollback()
|
|
return AllocResult{}, fmt.Errorf("pod %s/%s cidr4: %w", req.Namespace, req.Pod, err)
|
|
}
|
|
cidr := eff[i.randSrc.PickIndex(len(eff))]
|
|
ip, err := i.allocV4(cidr)
|
|
if err != nil {
|
|
rollback()
|
|
return AllocResult{}, err
|
|
}
|
|
i.used[canonical(ip)] = struct{}{}
|
|
out.IP4 = ip
|
|
}
|
|
|
|
return out, nil
|
|
}
|
|
|
|
// allocV6 picks an IPv6 /128 from the given CIDR, retrying on collision.
|
|
// Caller holds i.mu.
|
|
func (i *IPAM) allocV6(cidr *net.IPNet, req AllocRequest) (net.IP, error) {
|
|
const maxAttempts = 16
|
|
for attempt := 0; attempt < maxAttempts; attempt++ {
|
|
var ip net.IP
|
|
var err error
|
|
if len(req.IPAlgo) == 0 {
|
|
ip, err = i.randomV6(cidr)
|
|
} else {
|
|
ip, err = embed.Embed(cidr, req.IPAlgo, embed.Values{
|
|
Namespace: req.Namespace,
|
|
Pod: req.Pod,
|
|
Image: req.ImageDigest,
|
|
ImageFallback: req.ContainerID,
|
|
}, i.randSrc.NibbleN())
|
|
}
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if _, clash := i.used[canonical(ip)]; !clash {
|
|
return ip, nil
|
|
}
|
|
}
|
|
return nil, fmt.Errorf("IPv6 allocation: %d collisions in %s — giving up", maxAttempts, cidr)
|
|
}
|
|
|
|
// randomV6 picks a random /128 inside cidr. The network prefix bits are
|
|
// preserved from cidr.IP; the host bits are filled from the random source.
|
|
//
|
|
// Implementation: walk the 16 IPv6 bytes once. For each byte we ask whether
|
|
// it's entirely inside the network mask (skip), entirely inside the host
|
|
// portion (overwrite with random), or split (combine bits from both).
|
|
func (i *IPAM) randomV6(cidr *net.IPNet) (net.IP, error) {
|
|
ones, bits := cidr.Mask.Size()
|
|
if bits != 128 {
|
|
return nil, fmt.Errorf("cidr %s is not IPv6", cidr)
|
|
}
|
|
out := make(net.IP, net.IPv6len)
|
|
copy(out, cidr.IP.To16())
|
|
rnd := make([]byte, net.IPv6len)
|
|
i.randSrc.FillIID(rnd)
|
|
for b := 0; b < net.IPv6len; b++ {
|
|
byteStart := b * 8
|
|
byteEnd := byteStart + 8
|
|
switch {
|
|
case byteEnd <= ones:
|
|
// Entirely inside the network prefix — leave untouched.
|
|
continue
|
|
case byteStart >= ones:
|
|
// Entirely inside the host portion — fully randomise.
|
|
out[b] = rnd[b]
|
|
default:
|
|
// Split byte: top (ones-byteStart) bits are network, rest host.
|
|
networkBits := ones - byteStart
|
|
hostMask := byte(0xFF) >> uint(networkBits)
|
|
out[b] = (out[b] & ^hostMask) | (rnd[b] & hostMask)
|
|
}
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// allocV4 walks the CIDR linearly skipping network + broadcast addresses.
|
|
// Caller holds i.mu.
|
|
func (i *IPAM) allocV4(cidr *net.IPNet) (net.IP, error) {
|
|
ones, _ := cidr.Mask.Size()
|
|
total := uint64(1) << uint(32-ones)
|
|
if total < 4 {
|
|
return nil, fmt.Errorf("cidr %s has no usable host space", cidr)
|
|
}
|
|
base := ipToU32(cidr.IP.To4())
|
|
// Skip .0 (network) and .1 (reserved for gateway / routing convention) up to .<broadcast-1>.
|
|
for off := uint64(2); off < total-1; off++ {
|
|
ip := u32ToIP(base + uint32(off))
|
|
if _, clash := i.used[canonical(ip)]; !clash {
|
|
return ip, nil
|
|
}
|
|
}
|
|
return nil, fmt.Errorf("IPv4 allocation: %s exhausted", cidr)
|
|
}
|
|
|
|
// resolveEffective applies the cidr6/cidr4 annotation → node CIDR intersection
|
|
// rules from the design doc. Returns the list of CIDRs the allocator may
|
|
// actually allocate from. No intersection → error.
|
|
//
|
|
// Rules (from dfritz-cni.md):
|
|
//
|
|
// annCIDR == nodeCIDR → allocate from nodeCIDR
|
|
// annCIDR supernet of → allocate from the more specific nodeCIDR
|
|
// annCIDR subnet of → allocate from annCIDR (more restrictive)
|
|
// no overlap → caller error
|
|
//
|
|
// If annCIDRs is empty, all nodeCIDRs are eligible.
|
|
// If annCIDRs has multiple entries, the *first* that intersects any
|
|
// nodeCIDR wins, matching the design-doc phrasing "the agent uses the first
|
|
// one that intersects a node CIDR".
|
|
func resolveEffective(annCIDRs []*net.IPNet, nodeCIDRs []*net.IPNet) ([]*net.IPNet, error) {
|
|
if len(nodeCIDRs) == 0 {
|
|
return nil, fmt.Errorf("node has no CIDRs configured for this family")
|
|
}
|
|
if len(annCIDRs) == 0 {
|
|
return nodeCIDRs, nil
|
|
}
|
|
for _, ann := range annCIDRs {
|
|
var matches []*net.IPNet
|
|
for _, node := range nodeCIDRs {
|
|
m := intersectCIDR(ann, node)
|
|
if m != nil {
|
|
matches = append(matches, m)
|
|
}
|
|
}
|
|
if len(matches) > 0 {
|
|
return matches, nil
|
|
}
|
|
}
|
|
return nil, fmt.Errorf("annotation CIDRs %v do not intersect any node CIDR %v",
|
|
toStringSlice(annCIDRs), toStringSlice(nodeCIDRs))
|
|
}
|
|
|
|
// intersectCIDR returns the effective allocation range between an annotation
|
|
// CIDR and a node CIDR, or nil if disjoint.
|
|
func intersectCIDR(ann, node *net.IPNet) *net.IPNet {
|
|
// Same address family only.
|
|
if (ann.IP.To4() == nil) != (node.IP.To4() == nil) {
|
|
return nil
|
|
}
|
|
switch {
|
|
case cidrEqual(ann, node):
|
|
return node
|
|
case cidrContains(ann, node):
|
|
// ann is supernet of node → allocate from node.
|
|
return node
|
|
case cidrContains(node, ann):
|
|
// ann is subnet of node → allocate from ann.
|
|
return ann
|
|
default:
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// cidrEqual reports whether a and b have the same network address and the
// same prefix length.
func cidrEqual(a, b *net.IPNet) bool {
	lenA, _ := a.Mask.Size()
	lenB, _ := b.Mask.Size()
	return lenA == lenB && a.IP.Equal(b.IP)
}
|
|
|
|
// cidrContains reports whether a is a strict supernet of b (a ⊋ b): a must
// have a shorter prefix than b and must contain b's network address.
func cidrContains(a, b *net.IPNet) bool {
	outer, _ := a.Mask.Size()
	inner, _ := b.Mask.Size()
	return outer < inner && a.Contains(b.IP)
}
|
|
|
|
// toStringSlice renders each network in ns with its String method,
// preserving order. The result is never nil, even for empty input.
func toStringSlice(ns []*net.IPNet) []string {
	texts := make([]string, 0, len(ns))
	for _, network := range ns {
		texts = append(texts, network.String())
	}
	return texts
}
|
|
|
|
// canonical renders ip as text in its native family: dotted-quad for
// anything convertible with To4 (including 16-byte v4-in-v6 slices) and
// IPv6 text otherwise. The same host address therefore always maps to the
// same string, which is what the in-use map is keyed by.
//
// A nil or malformed ip yields "". Callers MUST treat the returned key as
// opaque and never use the empty string as a sentinel.
func canonical(ip net.IP) string {
	native := ip.To4()
	if native == nil {
		native = ip.To16()
	}
	if native == nil {
		return ""
	}
	return native.String()
}
|
|
|
|
// ipToU32 packs an IPv4 address into a uint32, most significant octet
// first. Callers are expected to pass an IPv4 address; anything To4 cannot
// convert yields 0 instead of a panic.
func ipToU32(ip net.IP) uint32 {
	var packed uint32
	// To4 returns nil for non-IPv4 input, in which case the loop is skipped.
	for _, octet := range ip.To4() {
		packed = packed<<8 | uint32(octet)
	}
	return packed
}
|
|
|
|
// u32ToIP unpacks u into a 4-byte IPv4 address, most significant octet
// first. It is the inverse of ipToU32.
func u32ToIP(u uint32) net.IP {
	ip := make(net.IP, net.IPv4len)
	// Fill from the last octet backwards, consuming the low byte each time.
	for idx := net.IPv4len - 1; idx >= 0; idx-- {
		ip[idx] = byte(u)
		u >>= 8
	}
	return ip
}
|