// flock/pkg/agent/ipam.go

package agent

import (
	"crypto/rand"
	"fmt"
	"math/big"
	"net"
	"sync"

	"code.fritzlab.net/fritzlab/flock/pkg/embed"
)

// IPAM allocates per-pod IPv6 and IPv4 addresses from the NodeConfig CIDRs.
//
// Invariants:
//   - Concurrent callers serialize through mu.
//   - Allocate() never returns an address already marked in-use. Collisions
//     trigger retry (IPv6: up to 16 attempts per CIDR, each drawing a fresh
//     random N nibble or a fresh random IID; IPv4: linear scan of the
//     effective CIDR for the first free address).
//   - Release is idempotent.
//
// IPAM is constructed per-agent; it holds no durable state. The caller is
// responsible for marking pre-existing committed allocations in-use via
// MarkInUse during startup reconciliation.
//
// Fields:
//   - mu guards used; Allocate, Release and MarkInUse all take it.
//   - cidr6 / cidr4 are the node CIDRs parsed by NewIPAM, one slice per
//     address family.
//   - used is the in-use set, keyed by the string canonical() produces.
//   - randSrc supplies all randomness; NewIPAM installs cryptoRand.
type IPAM struct {
	mu      sync.Mutex
	cidr6   []*net.IPNet
	cidr4   []*net.IPNet
	used    map[string]struct{} // canonical IP strings
	randSrc randSource
}

// randSource abstracts the randomness IPAM consumes. It is injectable so
// tests can use a deterministic source.
type randSource interface {
	// NibbleN returns a random 4-bit value. allocV6 passes a fresh one to
	// embed.Embed on every attempt when an ip-algo is in effect.
	NibbleN() byte
	// FillIID fills dst with random bytes. randomV6 calls it with a 16-byte
	// buffer and keeps only the bits that fall in the host portion.
	FillIID(dst []byte)
	// PickIndex returns a pseudo-random int in [0, n). Tests may pin this.
	// Allocate uses it to choose among the effective CIDRs.
	PickIndex(n int) int
}

// cryptoRand is the production randSource; every value comes from
// crypto/rand.
//
// randSource has no error channel, so entropy failures degrade to the
// all-zero answer instead of propagating. That is safe for the allocator:
// every candidate address is still checked against the in-use set before it
// is handed out, so the worst case is extra collisions, never a duplicate.
type cryptoRand struct{}

// NibbleN returns a random value in [0, 15].
func (cryptoRand) NibbleN() byte {
	var b [1]byte
	// Error deliberately ignored: on failure b stays zero and the nibble is
	// 0 (see the type comment for why that is acceptable).
	_, _ = rand.Read(b[:])
	return b[0] & 0x0F
}

// FillIID overwrites dst with random bytes.
func (cryptoRand) FillIID(dst []byte) {
	// Error deliberately ignored: on failure dst keeps whatever rand.Read
	// managed to write (see the type comment for why that is acceptable).
	_, _ = rand.Read(dst)
}

// PickIndex returns a uniformly distributed int in [0, n). It returns 0 when
// n <= 1, and also when the entropy source fails.
func (cryptoRand) PickIndex(n int) int {
	if n <= 1 {
		return 0
	}
	v, err := rand.Int(rand.Reader, big.NewInt(int64(n)))
	if err != nil {
		// rand.Int hands back a nil *big.Int together with the error;
		// calling Int64 on it would panic. Index 0 is always in range here.
		return 0
	}
	return int(v.Int64())
}

// AllocRequest describes a pending allocation. Values come from Pod metadata
// + annotations at CNI ADD time, with per-node FamilyDefaults already merged
// in (see ParseAnnotations).
type AllocRequest struct {
	// ContainerID, Namespace and Pod identify the workload. Namespace and
	// Pod appear in Allocate's error messages; when IPAlgo is set all three
	// are forwarded to embed (ContainerID as Values.ImageFallback).
	ContainerID string
	Namespace   string
	Pod         string
	// WantV6 / WantV4 are the post-merge address family selection (pod
	// annotation > NodeConfig.Spec.Defaults > built-in baseline of
	// dual-stack). At least one MUST be true; Allocate rejects the request
	// otherwise.
	WantV6 bool
	WantV4 bool
	// AnnCIDR6 / AnnCIDR4 come from the cidr6 / cidr4 annotations. Empty
	// means "use any of the node's CIDRs". Non-empty lists are intersected
	// with the node CIDRs by resolveEffective.
	AnnCIDR6 []*net.IPNet
	AnnCIDR4 []*net.IPNet
	// IPAlgo comes from the ip-algo annotation. Empty means random IID;
	// otherwise the IPv6 address is produced by embed.Embed.
	IPAlgo []embed.Field
	// ImageDigest is the sha256 manifest digest (with or without "sha256:"
	// prefix). It is passed to embed as Values.Image, alongside
	// Values.ImageFallback = ContainerID; presumably embed uses the fallback
	// for image-referencing ip-algo fields when this is empty — the
	// decision is made inside package embed, confirm there.
	ImageDigest string
}

// AllocResult is what the IPAM hands back to the CNI ADD. Allocate only
// fills in the families the request asked for.
type AllocResult struct {
	IP6 net.IP // nil if WantV6 was false
	IP4 net.IP // nil if WantV4 was false
}

// NewIPAM builds an allocator from the node's textual CIDR lists.
//
// Every entry of cidr6 must parse as a CIDR whose address is not IPv4, and
// every entry of cidr4 must parse as an IPv4 CIDR. cidr6 is validated in
// full before cidr4 is looked at; the first offending entry aborts
// construction and is named in the returned error. The new IPAM starts with
// an empty in-use set and draws randomness from crypto/rand.
func NewIPAM(cidr6, cidr4 []string) (*IPAM, error) {
	var v6nets, v4nets []*net.IPNet

	for idx := range cidr6 {
		text := cidr6[idx]
		_, parsed, err := net.ParseCIDR(text)
		switch {
		case err != nil:
			return nil, fmt.Errorf("cidr6 %q: %w", text, err)
		case parsed.IP.To4() != nil:
			return nil, fmt.Errorf("cidr6 %q is IPv4", text)
		}
		v6nets = append(v6nets, parsed)
	}

	for idx := range cidr4 {
		text := cidr4[idx]
		_, parsed, err := net.ParseCIDR(text)
		switch {
		case err != nil:
			return nil, fmt.Errorf("cidr4 %q: %w", text, err)
		case parsed.IP.To4() == nil:
			return nil, fmt.Errorf("cidr4 %q is not IPv4", text)
		}
		v4nets = append(v4nets, parsed)
	}

	return &IPAM{
		cidr6:   v6nets,
		cidr4:   v4nets,
		used:    make(map[string]struct{}),
		randSrc: cryptoRand{},
	}, nil
}

// MarkInUse adds ip to the in-use set so that Allocate will not hand it out.
// It exists for startup reconciliation, where the allocator is seeded from
// already-committed state. A nil ip is ignored; marking the same address
// twice is harmless.
func (i *IPAM) MarkInUse(ip net.IP) {
	if ip != nil {
		var present struct{}
		i.mu.Lock()
		i.used[canonical(ip)] = present
		i.mu.Unlock()
	}
}

// Release removes the given addresses from the in-use set, making them
// available to Allocate again. Nil entries are skipped and addresses that
// were never marked are ignored, so repeated calls are harmless.
func (i *IPAM) Release(ips ...net.IP) {
	i.mu.Lock()
	defer i.mu.Unlock()

	for idx := range ips {
		if ips[idx] == nil {
			continue
		}
		delete(i.used, canonical(ips[idx]))
	}
}

// Allocate resolves effective CIDRs from the request + node CIDRs, picks
// addresses, and records them as in-use. It is atomic with respect to other
// Allocate / Release / MarkInUse calls: i.mu is held for the whole call, and
// a failure in either family leaves the in-use set exactly as it was.
//
// For each requested family one effective CIDR is chosen at random as the
// starting point. If that CIDR cannot yield an address (exhausted, too many
// collisions, ...) the remaining effective CIDRs are tried in order before
// the request is failed, so one full CIDR does not fail pods while a sibling
// CIDR still has room.
//
// Returns an error when neither family is requested, when a requested family
// has no effective CIDR, or when every effective CIDR of a requested family
// fails to produce an address. The AllocResult is zero on error.
func (i *IPAM) Allocate(req AllocRequest) (AllocResult, error) {
	if !req.WantV6 && !req.WantV4 {
		return AllocResult{}, fmt.Errorf("pod %s/%s: at least one of ipv6/ipv4 must be true", req.Namespace, req.Pod)
	}

	i.mu.Lock()
	defer i.mu.Unlock()

	var out AllocResult

	// rollback undoes the IPv6 claim made earlier in this call, if any. It
	// is the only claim that can exist when a later step fails.
	rollback := func() {
		if out.IP6 != nil {
			delete(i.used, canonical(out.IP6))
		}
	}

	if req.WantV6 {
		eff, err := resolveEffective(req.AnnCIDR6, i.cidr6)
		if err != nil {
			return AllocResult{}, fmt.Errorf("pod %s/%s cidr6: %w", req.Namespace, req.Pod, err)
		}
		ip, err := i.allocWithFailover(eff, func(cidr *net.IPNet) (net.IP, error) {
			return i.allocV6(cidr, req)
		})
		if err != nil {
			return AllocResult{}, err
		}
		i.used[canonical(ip)] = struct{}{}
		out.IP6 = ip
	}

	if req.WantV4 {
		eff, err := resolveEffective(req.AnnCIDR4, i.cidr4)
		if err != nil {
			rollback()
			return AllocResult{}, fmt.Errorf("pod %s/%s cidr4: %w", req.Namespace, req.Pod, err)
		}
		ip, err := i.allocWithFailover(eff, i.allocV4)
		if err != nil {
			rollback()
			return AllocResult{}, err
		}
		i.used[canonical(ip)] = struct{}{}
		out.IP4 = ip
	}

	return out, nil
}

// allocWithFailover runs alloc against the CIDRs in eff, starting at a
// randomly picked index and wrapping around, and returns the first address
// obtained. When every CIDR fails it returns the error of the first CIDR
// tried, which for a single-CIDR list is exactly alloc's own error.
// Caller holds i.mu.
func (i *IPAM) allocWithFailover(eff []*net.IPNet, alloc func(*net.IPNet) (net.IP, error)) (net.IP, error) {
	if len(eff) == 0 {
		return nil, fmt.Errorf("no candidate CIDRs to allocate from")
	}
	start := i.randSrc.PickIndex(len(eff))
	var firstErr error
	for k := range eff {
		ip, err := alloc(eff[(start+k)%len(eff)])
		if err == nil {
			return ip, nil
		}
		if firstErr == nil {
			firstErr = err
		}
	}
	return nil, firstErr
}

// allocV6 produces a free IPv6 /128 inside cidr. Caller holds i.mu.
//
// Each attempt generates one candidate: a random host part via randomV6
// when the request carries no ip-algo, otherwise embed.Embed with a fresh
// random nibble. A generator error aborts immediately. A candidate that is
// already in the in-use set counts as a collision; after 16 collisions the
// allocation gives up with an error. The returned address is NOT recorded
// as in-use here.
func (i *IPAM) allocV6(cidr *net.IPNet, req AllocRequest) (net.IP, error) {
	const maxAttempts = 16

	// generate yields a single candidate address for this request.
	generate := func() (net.IP, error) {
		if len(req.IPAlgo) == 0 {
			return i.randomV6(cidr)
		}
		vals := embed.Values{
			Namespace:     req.Namespace,
			Pod:           req.Pod,
			Image:         req.ImageDigest,
			ImageFallback: req.ContainerID,
		}
		return embed.Embed(cidr, req.IPAlgo, vals, i.randSrc.NibbleN())
	}

	for remaining := maxAttempts; remaining > 0; remaining-- {
		candidate, err := generate()
		if err != nil {
			return nil, err
		}
		if _, taken := i.used[canonical(candidate)]; taken {
			continue
		}
		return candidate, nil
	}
	return nil, fmt.Errorf("IPv6 allocation: %d collisions in %s — giving up", maxAttempts, cidr)
}

// randomV6 returns a random /128 inside cidr: the bits covered by the
// network mask are taken from cidr.IP, every remaining (host) bit comes from
// the random source. It fails when cidr's mask is not 128 bits wide.
//
// Implementation: draw 16 random bytes, then merge byte by byte using the
// mask itself as the selector — mask bits keep the prefix, cleared bits let
// the random byte through. A byte straddling the prefix boundary is thereby
// split at exactly the right bit.
func (i *IPAM) randomV6(cidr *net.IPNet) (net.IP, error) {
	if _, width := cidr.Mask.Size(); width != 8*net.IPv6len {
		return nil, fmt.Errorf("cidr %s is not IPv6", cidr)
	}

	addr := make(net.IP, net.IPv6len)
	copy(addr, cidr.IP.To16())

	entropy := make([]byte, net.IPv6len)
	i.randSrc.FillIID(entropy)

	for pos, keep := range cidr.Mask {
		addr[pos] = addr[pos]&keep | entropy[pos]&^keep
	}
	return addr, nil
}

// allocV4 returns the lowest free host address in cidr. Caller holds i.mu.
//
// The scan is linear from the bottom of the network and never returns:
//   - offset 0, the network address;
//   - offset 1, reserved for the gateway / routing convention;
//   - the last offset, the broadcast address.
//
// cidr must carry a 32-bit mask; anything else is rejected rather than being
// reinterpreted as an IPv4 range. The network base is derived by masking
// cidr.IP, so stray host bits in cidr.IP cannot push the scan outside the
// network. Networks smaller than a /30 have no usable host space. The
// returned address is NOT recorded as in-use here.
func (i *IPAM) allocV4(cidr *net.IPNet) (net.IP, error) {
	ones, bits := cidr.Mask.Size()
	if bits != 8*net.IPv4len {
		return nil, fmt.Errorf("cidr %s is not IPv4", cidr)
	}
	total := uint64(1) << uint(bits-ones)
	if total < 4 {
		return nil, fmt.Errorf("cidr %s has no usable host space", cidr)
	}
	// Mask yields nil when cidr.IP is not an IPv4 (or v4-mapped) address.
	network := cidr.IP.Mask(cidr.Mask)
	if network == nil {
		return nil, fmt.Errorf("cidr %s is not IPv4", cidr)
	}
	base := ipToU32(network)
	// Skip .0 (network) and .1 (reserved for gateway / routing convention) up to .<broadcast-1>.
	for off := uint64(2); off < total-1; off++ {
		ip := u32ToIP(base + uint32(off))
		if _, clash := i.used[canonical(ip)]; !clash {
			return ip, nil
		}
	}
	return nil, fmt.Errorf("IPv4 allocation: %s exhausted", cidr)
}

// resolveEffective computes the CIDRs the allocator may draw from, given the
// pod's cidr6/cidr4 annotation and the node's CIDRs for the same family.
//
// Per-pair rules (from dfritz-cni.md), as implemented by intersectCIDR:
//
//	annCIDR == nodeCIDR  → allocate from nodeCIDR
//	annCIDR supernet of  → allocate from the more specific nodeCIDR
//	annCIDR subnet of    → allocate from annCIDR (more restrictive)
//	no overlap           → pair contributes nothing
//
// Outcomes:
//   - no node CIDRs at all → error;
//   - no annotation CIDRs → every node CIDR is eligible;
//   - otherwise annotation CIDRs are examined in order and the *first* one
//     that intersects at least one node CIDR decides the result: its
//     intersections with every node CIDR are returned (design doc: "the
//     agent uses the first one that intersects a node CIDR");
//   - no annotation CIDR intersects anything → error listing both sides.
func resolveEffective(annCIDRs []*net.IPNet, nodeCIDRs []*net.IPNet) ([]*net.IPNet, error) {
	switch {
	case len(nodeCIDRs) == 0:
		return nil, fmt.Errorf("node has no CIDRs configured for this family")
	case len(annCIDRs) == 0:
		return nodeCIDRs, nil
	}

	for a := range annCIDRs {
		var eligible []*net.IPNet
		for n := range nodeCIDRs {
			if hit := intersectCIDR(annCIDRs[a], nodeCIDRs[n]); hit != nil {
				eligible = append(eligible, hit)
			}
		}
		if len(eligible) != 0 {
			return eligible, nil
		}
	}

	wanted, available := toStringSlice(annCIDRs), toStringSlice(nodeCIDRs)
	return nil, fmt.Errorf("annotation CIDRs %v do not intersect any node CIDR %v",
		wanted, available)
}

// intersectCIDR picks the range to allocate from for one (annotation, node)
// pair, or returns nil when the two do not overlap or belong to different
// address families.
//
// The node CIDR is returned when the annotation equals it or is a supernet
// of it; the annotation CIDR is returned when it is a subnet of the node
// CIDR. The result is always one of the two arguments, never a copy.
func intersectCIDR(ann, node *net.IPNet) *net.IPNet {
	annIsV6 := ann.IP.To4() == nil
	nodeIsV6 := node.IP.To4() == nil
	if annIsV6 != nodeIsV6 {
		return nil
	}

	// Equal, or annotation wider than the node CIDR: the node CIDR is the
	// more specific (or identical) range.
	if cidrEqual(ann, node) || cidrContains(ann, node) {
		return node
	}
	// Annotation narrower than the node CIDR: honour the restriction.
	if cidrContains(node, ann) {
		return ann
	}
	return nil
}

// cidrEqual reports whether a and b have the same prefix length and the same
// network address. The addresses are compared as stored in the IPNet; they
// are not re-masked first.
func cidrEqual(a, b *net.IPNet) bool {
	aOnes, _ := a.Mask.Size()
	bOnes, _ := b.Mask.Size()
	return aOnes == bOnes && a.IP.Equal(b.IP)
}

// cidrContains reports whether a is a strict supernet of b (a ⊋ b): a's
// prefix must be shorter than b's, and b's address must fall inside a.
// Equal CIDRs therefore do not contain each other.
func cidrContains(a, b *net.IPNet) bool {
	aOnes, _ := a.Mask.Size()
	bOnes, _ := b.Mask.Size()
	return aOnes < bOnes && a.Contains(b.IP)
}

// toStringSlice renders each CIDR with its String method, preserving order.
// The result always has len(ns) elements and is never nil. It is used to
// build readable error messages.
func toStringSlice(ns []*net.IPNet) []string {
	rendered := make([]string, 0, len(ns))
	for _, cidr := range ns {
		rendered = append(rendered, cidr.String())
	}
	return rendered
}

// canonical produces the key under which an address is stored in the in-use
// map. An address that is IPv4 — whether held as 4 bytes or as a 16-byte
// v4-in-v6 value — is rendered in dotted-quad form; any other 16-byte
// address is rendered as IPv6 text. The same host address therefore always
// maps to the same key, however the net.IP was obtained.
//
// Returns "" for nil input and for slices that are neither 4 nor 16 bytes
// long — callers MUST treat the returned key as opaque and never use the
// empty string as a sentinel.
func canonical(ip net.IP) string {
	// To4 and To16 both return nil for a nil or wrongly sized slice, so
	// those inputs fall through to the empty key.
	if short := ip.To4(); short != nil {
		return short.String()
	}
	if long := ip.To16(); long != nil {
		return long.String()
	}
	return ""
}

// ipToU32 packs an IPv4 address into a uint32, first octet in the most
// significant byte. Both 4-byte and 16-byte v4-in-v6 inputs are accepted.
// The caller is expected to pass an IPv4 address; anything else (including
// nil) yields 0 rather than a panic.
func ipToU32(ip net.IP) uint32 {
	var word uint32
	// To4 returns nil for non-IPv4 input, so the loop is skipped and word
	// stays 0.
	for _, octet := range ip.To4() {
		word = word<<8 | uint32(octet)
	}
	return word
}

// u32ToIP is the inverse of ipToU32: it unpacks u into a 4-byte net.IP with
// the most significant byte as the first octet.
func u32ToIP(u uint32) net.IP {
	return net.IP{byte(u >> 24), byte(u >> 16), byte(u >> 8), byte(u)}
}