Files

410 lines
12 KiB
Go
Raw Permalink Normal View History

package agent
import (
"crypto/rand"
"fmt"
"math/big"
"net"
"sync"
"code.fritzlab.net/fritzlab/flock/pkg/embed"
)
// IPAM allocates per-pod IPv6 and IPv4 addresses from the NodeConfig CIDRs.
//
// Invariants:
// - Concurrent callers serialize through mu.
// - Allocate() never returns an address already marked in-use. Collisions
// trigger retry (IPv6: up to 16 regenerations of the random N nibble or
// random IID; IPv4: linear scan of the chosen CIDR starting at host
// offset 2).
// - Release is idempotent.
//
// IPAM is constructed per-agent; it holds no durable state. The caller is
// responsible for marking pre-existing committed allocations in-use via
// MarkInUse during startup reconciliation.
//
// The struct holds a sync.Mutex by value, so an IPAM must be shared by
// pointer and never copied; NewIPAM returns *IPAM.
type IPAM struct {
// mu is taken by Allocate, Release and MarkInUse before they touch used.
mu sync.Mutex
// cidr6 and cidr4 hold the node's parsed CIDRs per address family, in the
// order they were passed to NewIPAM.
cidr6 []*net.IPNet
cidr4 []*net.IPNet
// used is the set of addresses currently allocated or marked in-use,
// keyed by the output of canonical().
used map[string]struct{} // canonical IP strings
// randSrc supplies all randomness; NewIPAM installs cryptoRand.
randSrc randSource
}
// randSource is injectable so tests can use a deterministic source.
// The production implementation in this file is cryptoRand.
type randSource interface {
// NibbleN returns a random 4-bit value. allocV6 draws a fresh one for
// every embed attempt, so a collision can be retried with a new nibble.
NibbleN() byte
// FillIID fills dst with random bytes. randomV6 passes a 16-byte buffer
// and keeps only the bits that fall in the host portion of the CIDR.
FillIID(dst []byte)
// PickIndex returns a pseudo-random int in [0, n). Tests may pin this.
// Allocate uses the result directly as a slice index, so implementations
// must stay inside that range.
PickIndex(n int) int
}
// cryptoRand is the production randSource; every value comes from
// crypto/rand. It is stateless, so the zero value is ready to use.
type cryptoRand struct{}

// NibbleN returns a random value in [0, 15].
func (cryptoRand) NibbleN() byte {
	var b [1]byte
	// The error is deliberately ignored: crypto/rand.Read is documented as
	// never failing on current Go releases, and on older runtimes a failed
	// read merely leaves b zeroed. allocV6 still checks every candidate
	// against the in-use set, so a degenerate nibble cannot produce a
	// duplicate allocation.
	_, _ = rand.Read(b[:])
	return b[0] & 0x0F
}

// FillIID overwrites dst with random bytes.
func (cryptoRand) FillIID(dst []byte) {
	// Error ignored for the same reason as in NibbleN: the caller's
	// collision check is the safety net, not the entropy source.
	_, _ = rand.Read(dst)
}

// PickIndex returns a uniformly distributed index in [0, n). For n <= 1 the
// only possible answer is 0, which also keeps rand.Int from panicking on a
// non-positive bound.
//
// If the entropy source fails, PickIndex falls back to index 0 instead of
// dereferencing the nil *big.Int that rand.Int returns alongside the error.
// Index 0 is always valid on this path because n > 1.
func (cryptoRand) PickIndex(n int) int {
	if n <= 1 {
		return 0
	}
	v, err := rand.Int(rand.Reader, big.NewInt(int64(n)))
	if err != nil {
		return 0
	}
	return int(v.Int64())
}
// AllocRequest describes a pending allocation. Values come from Pod metadata
// + annotations at CNI ADD time, with per-node FamilyDefaults already merged
// in (see ParseAnnotations).
type AllocRequest struct {
// ContainerID identifies the container being set up. allocV6 passes it to
// the embed step as the image fallback value.
ContainerID string
// Namespace is the pod's namespace; it appears in error messages and is
// passed to the embed step.
Namespace string
// Pod is the literal pod name (used for logging only — not embedded).
Pod string
// App is the stable workload identity for the FieldApp embed field —
// typically the owning Deployment / StatefulSet / DaemonSet name.
// Computed by the handler; falls back to Pod when no usable owner is
// found (bare pods).
App string
// WantV6 / WantV4 are the post-merge address family selection (pod
// annotation > NodeConfig.Spec.Defaults > built-in baseline of
// dual-stack). At least one MUST be true; Allocate rejects the request
// otherwise.
WantV6 bool
WantV4 bool
// AnnCIDR6 / AnnCIDR4 come from the cidr6 / cidr4 annotations. Empty
// means "use any of the node's CIDRs".
AnnCIDR6 []*net.IPNet
AnnCIDR4 []*net.IPNet
// IPAlgo comes from the resolved ip-algo precedence chain. Empty means
// random IID.
IPAlgo []embed.Field
// Image is the spec'd image reference (typically
// pod.Spec.Containers[0].Image). When 64 hex chars, treated as a
// sha256 digest; otherwise FNV-1a-64'd as a string. Empty falls back
// to FNV(ContainerID) for ip-algo fields that reference image.
Image string
}
// AllocResult is what the IPAM hands back to the CNI ADD. Allocate only
// fills in the families the request asked for; the other field stays nil.
type AllocResult struct {
IP6 net.IP // nil if WantV6 was false
IP4 net.IP // nil if WantV4 was false
}
// NewIPAM builds an allocator from the node's textual CIDR lists.
//
// Every entry of cidr6 must parse as a CIDR and must not be an IPv4 network;
// every entry of cidr4 must parse and must be an IPv4 network. The first
// offending entry aborts construction with an error naming it. The returned
// IPAM starts with an empty in-use set and crypto-backed randomness.
func NewIPAM(cidr6, cidr4 []string) (*IPAM, error) {
	alloc := &IPAM{
		used:    make(map[string]struct{}),
		randSrc: cryptoRand{},
	}

	for _, raw := range cidr6 {
		_, prefix, err := net.ParseCIDR(raw)
		switch {
		case err != nil:
			return nil, fmt.Errorf("cidr6 %q: %w", raw, err)
		case prefix.IP.To4() != nil:
			return nil, fmt.Errorf("cidr6 %q is IPv4", raw)
		}
		alloc.cidr6 = append(alloc.cidr6, prefix)
	}

	for _, raw := range cidr4 {
		_, prefix, err := net.ParseCIDR(raw)
		switch {
		case err != nil:
			return nil, fmt.Errorf("cidr4 %q: %w", raw, err)
		case prefix.IP.To4() == nil:
			return nil, fmt.Errorf("cidr4 %q is not IPv4", raw)
		}
		alloc.cidr4 = append(alloc.cidr4, prefix)
	}

	return alloc, nil
}
// MarkInUse flags ip as already allocated so Allocate will never hand it
// out. It exists for startup reconciliation, where committed allocations are
// replayed into a freshly constructed IPAM. A nil ip is ignored.
func (i *IPAM) MarkInUse(ip net.IP) {
	if ip != nil {
		key := canonical(ip)
		i.mu.Lock()
		defer i.mu.Unlock()
		i.used[key] = struct{}{}
	}
}
// Release returns the given addresses to the free pool. Nil entries are
// skipped, and releasing an address that is not currently in use is a no-op,
// so the call is safe to repeat.
func (i *IPAM) Release(ips ...net.IP) {
	i.mu.Lock()
	defer i.mu.Unlock()

	for idx := range ips {
		if ips[idx] == nil {
			continue
		}
		delete(i.used, canonical(ips[idx]))
	}
}
// Allocate picks one address per requested family and records each as
// in-use. The whole operation runs under the IPAM lock, so it is atomic with
// respect to other Allocate / Release / MarkInUse calls.
//
// For each family the effective CIDR list is resolved from the request's
// annotation CIDRs and the node CIDRs, one CIDR is chosen through the random
// source, and an address is drawn from it. IPv6 is handled before IPv4; if a
// later step fails, addresses claimed earlier in the same call are released
// again before the error is returned. A request that wants neither family is
// rejected up front.
func (i *IPAM) Allocate(req AllocRequest) (AllocResult, error) {
	if !(req.WantV6 || req.WantV4) {
		return AllocResult{}, fmt.Errorf("pod %s/%s: at least one of ipv6/ipv4 must be true", req.Namespace, req.Pod)
	}

	i.mu.Lock()
	defer i.mu.Unlock()

	var (
		result AllocResult
		taken  []string // map keys claimed so far by this call
	)
	claim := func(ip net.IP) {
		key := canonical(ip)
		i.used[key] = struct{}{}
		taken = append(taken, key)
	}
	undo := func() {
		for _, key := range taken {
			delete(i.used, key)
		}
	}

	if req.WantV6 {
		candidates, err := resolveEffective(req.AnnCIDR6, i.cidr6)
		if err != nil {
			return AllocResult{}, fmt.Errorf("pod %s/%s cidr6: %w", req.Namespace, req.Pod, err)
		}
		chosen := candidates[i.randSrc.PickIndex(len(candidates))]
		addr, err := i.allocV6(chosen, req)
		if err != nil {
			undo()
			return AllocResult{}, err
		}
		claim(addr)
		result.IP6 = addr
	}

	if req.WantV4 {
		candidates, err := resolveEffective(req.AnnCIDR4, i.cidr4)
		if err != nil {
			undo()
			return AllocResult{}, fmt.Errorf("pod %s/%s cidr4: %w", req.Namespace, req.Pod, err)
		}
		chosen := candidates[i.randSrc.PickIndex(len(candidates))]
		addr, err := i.allocV4(chosen)
		if err != nil {
			undo()
			return AllocResult{}, err
		}
		claim(addr)
		result.IP4 = addr
	}

	return result, nil
}
// allocV6 produces a free IPv6 address inside cidr. With no ip-algo fields
// in the request the address is fully random; otherwise it is built by
// embed.Embed from the request's identity values plus a fresh random nibble
// per attempt. A candidate that is already in use is discarded and another
// one generated, up to maxAttempts times. Any generation error is returned
// immediately. The address is NOT recorded as in-use here.
//
// Caller holds i.mu.
func (i *IPAM) allocV6(cidr *net.IPNet, req AllocRequest) (net.IP, error) {
	const maxAttempts = 16

	// generate yields one candidate address using the strategy the request
	// selected.
	generate := func() (net.IP, error) {
		if len(req.IPAlgo) == 0 {
			return i.randomV6(cidr)
		}
		vals := embed.Values{
			Namespace:     req.Namespace,
			App:           req.App,
			Image:         req.Image,
			ImageFallback: req.ContainerID,
		}
		return embed.Embed(cidr, req.IPAlgo, vals, i.randSrc.NibbleN())
	}

	for remaining := maxAttempts; remaining > 0; remaining-- {
		candidate, err := generate()
		if err != nil {
			return nil, err
		}
		if _, taken := i.used[canonical(candidate)]; taken {
			continue
		}
		return candidate, nil
	}
	return nil, fmt.Errorf("IPv6 allocation: %d collisions in %s — giving up", maxAttempts, cidr)
}
// randomV6 returns a random /128 inside cidr: the prefix bits come from
// cidr.IP and every host bit comes from the random source. A CIDR whose mask
// is not 128 bits wide is rejected.
//
// Implementation: the address starts as a copy of cidr.IP. If the prefix
// ends in the middle of a byte, that one byte is merged bit-wise (high bits
// from the prefix, low bits random). All bytes after the prefix are then
// overwritten wholesale with random data.
func (i *IPAM) randomV6(cidr *net.IPNet) (net.IP, error) {
	prefixLen, width := cidr.Mask.Size()
	if width != 128 {
		return nil, fmt.Errorf("cidr %s is not IPv6", cidr)
	}

	addr := make(net.IP, net.IPv6len)
	copy(addr, cidr.IP.To16())

	entropy := make([]byte, net.IPv6len)
	i.randSrc.FillIID(entropy)

	// hostStart is the index of the first byte that is purely host bits.
	hostStart, spare := prefixLen/8, prefixLen%8
	if spare != 0 {
		// The prefix ends inside this byte: keep its top `spare` bits and
		// take the remaining low bits from the random data.
		keep := byte(0xFF) << uint(8-spare)
		addr[hostStart] = addr[hostStart]&keep | entropy[hostStart]&^keep
		hostStart++
	}
	copy(addr[hostStart:], entropy[hostStart:])
	return addr, nil
}
// allocV4 returns the lowest free IPv4 address in cidr by scanning host
// offsets in ascending order. Offset 0 (network address), offset 1 (reserved
// by convention for the gateway) and the last offset (broadcast) are never
// handed out, so a network needs at least four addresses (/30 or larger) to
// be usable. The address is NOT recorded as in-use here.
//
// A CIDR whose mask is not 32 bits wide is rejected, mirroring the width
// check in randomV6. Without it, a non-IPv4 network with a short prefix
// would yield a zero base and the scan would hand out 0.0.0.x addresses.
//
// Caller holds i.mu.
func (i *IPAM) allocV4(cidr *net.IPNet) (net.IP, error) {
	ones, bits := cidr.Mask.Size()
	if bits != 8*net.IPv4len {
		return nil, fmt.Errorf("cidr %s is not IPv4", cidr)
	}
	total := uint64(1) << uint(bits-ones)
	if total < 4 {
		return nil, fmt.Errorf("cidr %s has no usable host space", cidr)
	}
	base := ipToU32(cidr.IP.To4())
	// Start at offset 2 and stop before the broadcast address (total-1).
	for off := uint64(2); off < total-1; off++ {
		ip := u32ToIP(base + uint32(off))
		if _, clash := i.used[canonical(ip)]; !clash {
			return ip, nil
		}
	}
	return nil, fmt.Errorf("IPv4 allocation: %s exhausted", cidr)
}
// resolveEffective computes the CIDRs the allocator may draw from, given the
// pod's annotation CIDRs and the node's CIDRs for one address family.
//
// Rules (from dfritz-cni.md), applied per annotation/node pair:
//
//	annCIDR == nodeCIDR  → allocate from nodeCIDR
//	annCIDR supernet of  → allocate from the more specific nodeCIDR
//	annCIDR subnet of    → allocate from annCIDR (more restrictive)
//	no overlap           → pair contributes nothing
//
// A node without CIDRs for the family is an error. With no annotation CIDRs
// every node CIDR is eligible and the node slice is returned as-is. With
// several annotation CIDRs they are tried in order, and the first one that
// intersects at least one node CIDR decides the result; this matches the
// design-doc phrasing "the agent uses the first one that intersects a node
// CIDR". If none intersects, an error listing both sets is returned.
func resolveEffective(annCIDRs []*net.IPNet, nodeCIDRs []*net.IPNet) ([]*net.IPNet, error) {
	switch {
	case len(nodeCIDRs) == 0:
		return nil, fmt.Errorf("node has no CIDRs configured for this family")
	case len(annCIDRs) == 0:
		return nodeCIDRs, nil
	}

	for idx := range annCIDRs {
		var usable []*net.IPNet
		for _, node := range nodeCIDRs {
			if hit := intersectCIDR(annCIDRs[idx], node); hit != nil {
				usable = append(usable, hit)
			}
		}
		if len(usable) != 0 {
			return usable, nil
		}
	}

	return nil, fmt.Errorf("annotation CIDRs %v do not intersect any node CIDR %v",
		toStringSlice(annCIDRs), toStringSlice(nodeCIDRs))
}
// intersectCIDR yields the range to allocate from when an annotation CIDR
// meets a node CIDR: the node CIDR when the annotation equals or encloses
// it, the annotation CIDR when the node encloses the annotation, and nil
// when the two are disjoint or belong to different address families.
func intersectCIDR(ann, node *net.IPNet) *net.IPNet {
	annIsV4 := ann.IP.To4() != nil
	nodeIsV4 := node.IP.To4() != nil
	if annIsV4 != nodeIsV4 {
		// Mixed families never intersect.
		return nil
	}

	if cidrEqual(ann, node) || cidrContains(ann, node) {
		// Annotation is the same as, or wider than, the node range.
		return node
	}
	if cidrContains(node, ann) {
		// Annotation narrows the node range.
		return ann
	}
	return nil
}
// cidrEqual reports whether a and b have the same base address and the same
// prefix length.
func cidrEqual(a, b *net.IPNet) bool {
	aLen, _ := a.Mask.Size()
	bLen, _ := b.Mask.Size()
	return a.IP.Equal(b.IP) && aLen == bLen
}
// cidrContains reports whether a is a strict supernet of b (a ⊋ b): a must
// have a shorter prefix than b and b's base address must lie inside a.
// Equal networks are not considered contained.
func cidrContains(a, b *net.IPNet) bool {
	outer, _ := a.Mask.Size()
	inner, _ := b.Mask.Size()
	return outer < inner && a.Contains(b.IP)
}
// toStringSlice renders each network in ns with its String method,
// preserving order. It is used to build readable error messages.
func toStringSlice(ns []*net.IPNet) []string {
	rendered := make([]string, 0, len(ns))
	for _, network := range ns {
		rendered = append(rendered, network.String())
	}
	return rendered
}
// canonical renders ip as text in its native family: dotted-quad for
// anything that is an IPv4 address (4-byte or v4-in-v6), IPv6 text for
// everything else. The same host address therefore always maps to the same
// string no matter how the net.IP was built, which makes the result suitable
// as the key of the in-use map.
//
// Nil or malformed input yields "" — callers MUST treat the returned key as
// opaque and never use the empty string as a sentinel.
func canonical(ip net.IP) string {
	if ip == nil {
		return ""
	}
	native := ip.To4()
	if native == nil {
		native = ip.To16()
	}
	if native == nil {
		return ""
	}
	return native.String()
}
// ipToU32 packs an IPv4 address into a uint32, most significant octet
// first. Callers are expected to pass an IPv4 address; anything else yields
// 0 rather than a panic.
func ipToU32(ip net.IP) uint32 {
	var packed uint32
	// To4 returns nil for non-IPv4 input, so the loop body never runs and
	// the zero value is returned.
	for _, octet := range ip.To4() {
		packed = packed<<8 | uint32(octet)
	}
	return packed
}
// u32ToIP unpacks a uint32 into a 4-byte IPv4 net.IP, most significant octet
// first. It is the inverse of ipToU32.
func u32ToIP(u uint32) net.IP {
	return net.IP{byte(u >> 24), byte(u >> 16), byte(u >> 8), byte(u)}
}