package agent import ( "crypto/rand" "fmt" "math/big" "net" "sync" "code.fritzlab.net/fritzlab/flock/pkg/embed" ) // IPAM allocates per-pod IPv6 and IPv4 addresses from the NodeConfig CIDRs. // // Invariants: // - Concurrent callers serialize through mu. // - Allocate() never returns an address already marked in-use. Collisions // trigger retry (IPv6: up to 16 regenerations of the random N nibble or // random IID; IPv4: linear scan of the effective /24 or smaller). // - Release is idempotent. // // IPAM is constructed per-agent; it holds no durable state. The caller is // responsible for marking pre-existing committed allocations in-use via // MarkInUse during startup reconciliation. type IPAM struct { mu sync.Mutex cidr6 []*net.IPNet cidr4 []*net.IPNet used map[string]struct{} // canonical IP strings randSrc randSource } // randSource is injectable so tests can use a deterministic source. type randSource interface { // NibbleN returns a random 4-bit value. NibbleN() byte // FillIID fills dst with random bytes. FillIID(dst []byte) // PickIndex returns a pseudo-random int in [0, n). Tests may pin this. PickIndex(n int) int } // cryptoRand uses crypto/rand for real allocations. type cryptoRand struct{} func (cryptoRand) NibbleN() byte { var b [1]byte _, _ = rand.Read(b[:]) return b[0] & 0x0F } func (cryptoRand) FillIID(dst []byte) { _, _ = rand.Read(dst) } func (cryptoRand) PickIndex(n int) int { if n <= 1 { return 0 } big_, _ := rand.Int(rand.Reader, big.NewInt(int64(n))) return int(big_.Int64()) } // AllocRequest describes a pending allocation. Values come from Pod metadata // + annotations at CNI ADD time, with per-node FamilyDefaults already merged // in (see ParseAnnotations). type AllocRequest struct { ContainerID string Namespace string Pod string // WantV6 / WantV4 are the post-merge address family selection (pod // annotation > NodeConfig.Spec.Defaults > built-in baseline of // dual-stack). 
At least one MUST be true; Allocate rejects the request // otherwise. WantV6 bool WantV4 bool // AnnCIDR6 / AnnCIDR4 come from the cidr6 / cidr4 annotations. Empty // means "use any of the node's CIDRs". AnnCIDR6 []*net.IPNet AnnCIDR4 []*net.IPNet // IPAlgo comes from the ip-algo annotation. Empty means random IID. IPAlgo []embed.Field // ImageDigest is the sha256 manifest digest (with or without "sha256:" // prefix). If empty, embed.Values.ImageFallback = ContainerID is used // for ip-algo fields that reference image. ImageDigest string } // AllocResult is what the IPAM hands back to the CNI ADD. type AllocResult struct { IP6 net.IP // nil if WantV6 was false IP4 net.IP } // NewIPAM constructs an IPAM seeded from NodeConfig CIDRs. func NewIPAM(cidr6, cidr4 []string) (*IPAM, error) { i := &IPAM{ used: map[string]struct{}{}, randSrc: cryptoRand{}, } for _, s := range cidr6 { _, n, err := net.ParseCIDR(s) if err != nil { return nil, fmt.Errorf("cidr6 %q: %w", s, err) } if n.IP.To4() != nil { return nil, fmt.Errorf("cidr6 %q is IPv4", s) } i.cidr6 = append(i.cidr6, n) } for _, s := range cidr4 { _, n, err := net.ParseCIDR(s) if err != nil { return nil, fmt.Errorf("cidr4 %q: %w", s, err) } if n.IP.To4() == nil { return nil, fmt.Errorf("cidr4 %q is not IPv4", s) } i.cidr4 = append(i.cidr4, n) } return i, nil } // MarkInUse records that `ip` is already taken. Used during startup // reconciliation to seed the allocator from committed state. func (i *IPAM) MarkInUse(ip net.IP) { if ip == nil { return } i.mu.Lock() i.used[canonical(ip)] = struct{}{} i.mu.Unlock() } // Release marks one or more IPs as free. Safe to call for IPs not in use. func (i *IPAM) Release(ips ...net.IP) { i.mu.Lock() defer i.mu.Unlock() for _, ip := range ips { if ip != nil { delete(i.used, canonical(ip)) } } } // Allocate resolves effective CIDRs from the request + node CIDRs, picks // addresses, and records them as in-use. It is atomic with respect to other // Allocate / Release / MarkInUse calls. 
func (i *IPAM) Allocate(req AllocRequest) (AllocResult, error) { if !req.WantV6 && !req.WantV4 { return AllocResult{}, fmt.Errorf("pod %s/%s: at least one of ipv6/ipv4 must be true", req.Namespace, req.Pod) } i.mu.Lock() defer i.mu.Unlock() var out AllocResult var claimed []net.IP rollback := func() { for _, ip := range claimed { delete(i.used, canonical(ip)) } } if req.WantV6 { eff, err := resolveEffective(req.AnnCIDR6, i.cidr6) if err != nil { return AllocResult{}, fmt.Errorf("pod %s/%s cidr6: %w", req.Namespace, req.Pod, err) } cidr := eff[i.randSrc.PickIndex(len(eff))] ip, err := i.allocV6(cidr, req) if err != nil { rollback() return AllocResult{}, err } i.used[canonical(ip)] = struct{}{} claimed = append(claimed, ip) out.IP6 = ip } if req.WantV4 { eff, err := resolveEffective(req.AnnCIDR4, i.cidr4) if err != nil { rollback() return AllocResult{}, fmt.Errorf("pod %s/%s cidr4: %w", req.Namespace, req.Pod, err) } cidr := eff[i.randSrc.PickIndex(len(eff))] ip, err := i.allocV4(cidr) if err != nil { rollback() return AllocResult{}, err } i.used[canonical(ip)] = struct{}{} out.IP4 = ip } return out, nil } // allocV6 picks an IPv6 /128 from the given CIDR, retrying on collision. // Caller holds i.mu. func (i *IPAM) allocV6(cidr *net.IPNet, req AllocRequest) (net.IP, error) { const maxAttempts = 16 for attempt := 0; attempt < maxAttempts; attempt++ { var ip net.IP var err error if len(req.IPAlgo) == 0 { ip, err = i.randomV6(cidr) } else { ip, err = embed.Embed(cidr, req.IPAlgo, embed.Values{ Namespace: req.Namespace, Pod: req.Pod, Image: req.ImageDigest, ImageFallback: req.ContainerID, }, i.randSrc.NibbleN()) } if err != nil { return nil, err } if _, clash := i.used[canonical(ip)]; !clash { return ip, nil } } return nil, fmt.Errorf("IPv6 allocation: %d collisions in %s — giving up", maxAttempts, cidr) } // randomV6 picks a random /128 inside cidr. The network prefix bits are // preserved from cidr.IP; the host bits are filled from the random source. 
// // Implementation: walk the 16 IPv6 bytes once. For each byte we ask whether // it's entirely inside the network mask (skip), entirely inside the host // portion (overwrite with random), or split (combine bits from both). func (i *IPAM) randomV6(cidr *net.IPNet) (net.IP, error) { ones, bits := cidr.Mask.Size() if bits != 128 { return nil, fmt.Errorf("cidr %s is not IPv6", cidr) } out := make(net.IP, net.IPv6len) copy(out, cidr.IP.To16()) rnd := make([]byte, net.IPv6len) i.randSrc.FillIID(rnd) for b := 0; b < net.IPv6len; b++ { byteStart := b * 8 byteEnd := byteStart + 8 switch { case byteEnd <= ones: // Entirely inside the network prefix — leave untouched. continue case byteStart >= ones: // Entirely inside the host portion — fully randomise. out[b] = rnd[b] default: // Split byte: top (ones-byteStart) bits are network, rest host. networkBits := ones - byteStart hostMask := byte(0xFF) >> uint(networkBits) out[b] = (out[b] & ^hostMask) | (rnd[b] & hostMask) } } return out, nil } // allocV4 walks the CIDR linearly skipping network + broadcast addresses. // Caller holds i.mu. func (i *IPAM) allocV4(cidr *net.IPNet) (net.IP, error) { ones, _ := cidr.Mask.Size() total := uint64(1) << uint(32-ones) if total < 4 { return nil, fmt.Errorf("cidr %s has no usable host space", cidr) } base := ipToU32(cidr.IP.To4()) // Skip .0 (network) and .1 (reserved for gateway / routing convention) up to .. for off := uint64(2); off < total-1; off++ { ip := u32ToIP(base + uint32(off)) if _, clash := i.used[canonical(ip)]; !clash { return ip, nil } } return nil, fmt.Errorf("IPv4 allocation: %s exhausted", cidr) } // resolveEffective applies the cidr6/cidr4 annotation → node CIDR intersection // rules from the design doc. Returns the list of CIDRs the allocator may // actually allocate from. No intersection → error. 
// // Rules (from dfritz-cni.md): // // annCIDR == nodeCIDR → allocate from nodeCIDR // annCIDR supernet of → allocate from the more specific nodeCIDR // annCIDR subnet of → allocate from annCIDR (more restrictive) // no overlap → caller error // // If annCIDRs is empty, all nodeCIDRs are eligible. // If annCIDRs has multiple entries, the *first* that intersects any // nodeCIDR wins, matching the design-doc phrasing "the agent uses the first // one that intersects a node CIDR". func resolveEffective(annCIDRs []*net.IPNet, nodeCIDRs []*net.IPNet) ([]*net.IPNet, error) { if len(nodeCIDRs) == 0 { return nil, fmt.Errorf("node has no CIDRs configured for this family") } if len(annCIDRs) == 0 { return nodeCIDRs, nil } for _, ann := range annCIDRs { var matches []*net.IPNet for _, node := range nodeCIDRs { m := intersectCIDR(ann, node) if m != nil { matches = append(matches, m) } } if len(matches) > 0 { return matches, nil } } return nil, fmt.Errorf("annotation CIDRs %v do not intersect any node CIDR %v", toStringSlice(annCIDRs), toStringSlice(nodeCIDRs)) } // intersectCIDR returns the effective allocation range between an annotation // CIDR and a node CIDR, or nil if disjoint. func intersectCIDR(ann, node *net.IPNet) *net.IPNet { // Same address family only. if (ann.IP.To4() == nil) != (node.IP.To4() == nil) { return nil } switch { case cidrEqual(ann, node): return node case cidrContains(ann, node): // ann is supernet of node → allocate from node. return node case cidrContains(node, ann): // ann is subnet of node → allocate from ann. return ann default: return nil } } func cidrEqual(a, b *net.IPNet) bool { if !a.IP.Equal(b.IP) { return false } ao, _ := a.Mask.Size() bo, _ := b.Mask.Size() return ao == bo } // cidrContains returns true if `a` is a strict supernet of `b` (a ⊋ b). 
func cidrContains(a, b *net.IPNet) bool {
	aOnes, _ := a.Mask.Size()
	bOnes, _ := b.Mask.Size()
	// a must have the strictly shorter prefix and cover b's network address.
	return aOnes < bOnes && a.Contains(b.IP)
}

// toStringSlice renders each CIDR with its String method, preserving order.
// The result is always non-nil, even for empty input.
func toStringSlice(ns []*net.IPNet) []string {
	rendered := make([]string, 0, len(ns))
	for _, cidr := range ns {
		rendered = append(rendered, cidr.String())
	}
	return rendered
}

// canonical returns the textual form of ip in its native family, so the same
// host address is always represented identically regardless of whether it
// arrived as a 4-byte slice, a 16-byte v4-in-v6 slice, or a string-parsed
// net.IP. Used as the key for the in-use map.
//
// Returns "" for nil input (and for any slice that is neither a valid IPv4
// nor IPv6 address) — callers MUST treat the returned key as opaque and never
// use the empty string as a sentinel.
func canonical(ip net.IP) string {
	// Prefer the 4-byte form; To4 also unwraps v4-in-v6 addresses.
	native := ip.To4()
	if native == nil {
		native = ip.To16()
	}
	if native == nil {
		return ""
	}
	return native.String()
}

// ipToU32 reads a 4-byte IPv4 net.IP into a uint32, most significant octet
// first. The caller is expected to have already validated that ip is an IPv4
// address; mis-use returns 0 rather than panicking.
func ipToU32(ip net.IP) uint32 {
	var packed uint32
	// To4 yields nil for non-IPv4 input, so the loop is skipped and 0 returned.
	for _, octet := range ip.To4() {
		packed = packed<<8 | uint32(octet)
	}
	return packed
}

// u32ToIP is the inverse of ipToU32: it unpacks u, most significant octet
// first, into a 4-byte net.IP.
func u32ToIP(u uint32) net.IP {
	return net.IP{byte(u >> 24), byte(u >> 16), byte(u >> 8), byte(u)}
}