package agent

import (
	"context"
	"fmt"
	"log/slog"
	"net"
	"strings"
	"time"

	flockcni "code.fritzlab.net/fritzlab/flock/pkg/cni"
	cnitypes "github.com/containernetworking/cni/pkg/types"
	current "github.com/containernetworking/cni/pkg/types/100"
	corev1 "k8s.io/api/core/v1"
)

// podTemplateHashLabel is the well-known label Kubernetes attaches to
// every Pod owned by a ReplicaSet, which lets the ReplicaSet name be read
// as "<workload>-<hash>". deriveAppName uses the label value to peel the
// "-<hash>" suffix back off.
const podTemplateHashLabel = "pod-template-hash"

// deriveAppName returns the stable workload identifier for a Pod: the name
// of its controller owner, minus the pod-template-hash suffix when that
// owner is a ReplicaSet.
//
// How each ownership chain resolves:
//
//	Deployment → ReplicaSet → Pod   owner is an RS named "<deploy>-<hash>";
//	                                the trailing "-<hash>" is removed to
//	                                recover the Deployment name.
//	StatefulSet → Pod               owner is the STS itself; used as-is.
//	DaemonSet → Pod                 owner is the DS itself; used as-is.
//	Job → Pod                       owner is the Job itself; used as-is.
//	(bare pod)                      no controller owner; the pod name is used.
//
// Every replica of one workload therefore yields the same string, which is
// the property the ip-algo `app` field needs.
func deriveAppName(pod *corev1.Pod) string {
	ctrl := controllerOwner(pod)
	switch {
	case ctrl == nil:
		// Bare pod: there is no controller to name the workload after.
		return pod.Name
	case ctrl.Kind != "ReplicaSet":
		return ctrl.Name
	}

	// A missing label and an empty label are treated the same way: there is
	// no hash to strip, so the ReplicaSet name is the best identifier left.
	hash := pod.Labels[podTemplateHashLabel]
	if hash == "" {
		return ctrl.Name
	}
	if base, stripped := strings.CutSuffix(ctrl.Name, "-"+hash); stripped {
		return base
	}

	// The ReplicaSet name does not follow the "<workload>-<hash>" convention
	// (e.g. it was created by a custom controller). Using the RS name still
	// keeps replicas of that one RS aligned with each other.
	return ctrl.Name
}

// controllerOwner returns the Kind and Name of the first OwnerReference
// whose Controller flag is set and true, or nil when the pod has none.
// Kubernetes guarantees at most one controller per object.
func controllerOwner(pod *corev1.Pod) *metav1OwnerLite {
	for _, ref := range pod.OwnerReferences {
		if ref.Controller == nil || !*ref.Controller {
			continue
		}
		return &metav1OwnerLite{Kind: ref.Kind, Name: ref.Name}
	}
	return nil
}

// metav1OwnerLite carries only the two OwnerReference fields this file
// actually consults.
type metav1OwnerLite struct {
	Kind string
	Name string
}

// podImageRef returns the image string declared on the pod's first
// container, or "" when the pod spec lists no containers. The spec'd image
// is used (rather than a runtime-resolved digest) so the value stays the
// same across replicas of one Deployment; on "" the embed package is
// expected to fall back to FNV(containerID).
func podImageRef(pod *corev1.Pod) string {
	containers := pod.Spec.Containers
	if len(containers) > 0 {
		return containers[0].Image
	}
	return ""
}

// PodHandler is the platform-agnostic implementation of CNI ADD, DEL and
// CHECK. ADD looks the Pod up through the pod cache, parses its
// annotations, allocates addresses from IPAM, programs the netns through
// SetupFunc and persists the outcome in Store. Keeping the netns work
// behind SetupFunc/TeardownFunc lets platform stubs and tests exercise the
// rest of the orchestration.
type PodHandler struct {
	Node       string
	Store      *Store
	IPAM       *IPAM
	Pods       *PodCache
	NodeConfig *NodeConfigCache
	Logger     *slog.Logger

	// SetupFunc and TeardownFunc are injected at startup: the Linux netlink
	// operations in production, fakes in tests.
	SetupFunc    func(SetupRequest) error
	TeardownFunc func(containerID string, ip6, ip4 net.IP) error

	// AfterCommit, when non-nil, is invoked with no arguments after a
	// successful ADD or DEL; it is used to refresh the BIRD config from the
	// post-mutation state.
	AfterCommit func()
}

// Add implements the CNI ADD path.
func (h *PodHandler) Add(ctx context.Context, req flockcni.Request) (*current.Result, error) { args := ParseCNIArgs(req.Args) if args.PodName == "" || args.PodNamespace == "" { return nil, fmt.Errorf("CNI_ARGS missing K8S_POD_NAMESPACE/NAME") } // Idempotency: if we already committed this containerID, return the // existing IPs. kubelet retries ADD on the same sandbox. if existing, ok := h.Store.Get(req.ContainerID); ok && existing.State == StateCommitted { return resultFromAllocation(req.IfName, existing), nil } pod, err := h.Pods.WaitForPod(ctx, args.PodNamespace, args.PodName, 3*time.Second) if err != nil { return nil, fmt.Errorf("lookup pod: %w", err) } nc := h.NodeConfig.Load() defaults := FamilyDefaultsFromNodeConfig(nc) parsed, err := ParseAnnotations(pod.Annotations, defaults) if err != nil { return nil, fmt.Errorf("parse annotations: %w", err) } var nodeAnn map[string]string if nc != nil { nodeAnn = nc.GetAnnotations() } ipAlgo := ResolveIPAlgo(pod.Annotations, nodeAnn, h.Logger) allocReq := AllocRequest{ ContainerID: req.ContainerID, Namespace: args.PodNamespace, Pod: args.PodName, App: deriveAppName(pod), WantV6: parsed.WantV6, WantV4: parsed.WantV4, AnnCIDR6: parsed.CIDR6, AnnCIDR4: parsed.CIDR4, IPAlgo: ipAlgo, Image: podImageRef(pod), } res, err := h.IPAM.Allocate(allocReq) if err != nil { return nil, fmt.Errorf("ipam: %w", err) } // Persist pending entry before any netlink work so a crash mid-ADD // leaves recoverable state. 
pending := Allocation{ ContainerID: req.ContainerID, Namespace: args.PodNamespace, PodName: args.PodName, OwnerUID: string(pod.UID), IP6: ipString(res.IP6), IP4: ipString(res.IP4), Anycast: anycastStrings(parsed.Anycast), Addresses: anycastStrings(parsed.Addresses), State: StatePending, AllocatedAt: time.Now().UTC(), } if err := h.Store.Upsert(pending); err != nil { h.IPAM.Release(res.IP6, res.IP4) return nil, fmt.Errorf("store pending: %w", err) } setup := SetupRequest{ ContainerID: req.ContainerID, Netns: req.Netns, IfName: req.IfName, HostIface: HostIfaceName(req.ContainerID), IP6: res.IP6, IP4: res.IP4, Anycast: parsed.Anycast, Addresses: parsed.Addresses, } if err := h.SetupFunc(setup); err != nil { // Roll forward: leave pending entry in place so startup GC can clean // up the partial netns; let kubelet retry ADD. return nil, fmt.Errorf("netns setup: %w", err) } committed := pending committed.State = StateCommitted if err := h.Store.Upsert(committed); err != nil { return nil, fmt.Errorf("store commit: %w", err) } if h.AfterCommit != nil { h.AfterCommit() } return resultFromAllocation(req.IfName, committed), nil } // Del implements CNI DEL. Idempotent. func (h *PodHandler) Del(ctx context.Context, req flockcni.Request) error { entry, ok := h.Store.Get(req.ContainerID) if !ok { return nil } ip6 := net.ParseIP(entry.IP6) ip4 := net.ParseIP(entry.IP4) if err := h.TeardownFunc(req.ContainerID, ip6, ip4); err != nil { return fmt.Errorf("netns teardown: %w", err) } if err := h.Store.Delete(req.ContainerID); err != nil { return fmt.Errorf("store delete: %w", err) } h.IPAM.Release(ip6, ip4) if h.AfterCommit != nil { h.AfterCommit() } return nil } // Check verifies that the persisted state is consistent. M2 minimum: just // look up the entry; full kernel-state comparison is M7. 
func (h *PodHandler) Check(_ context.Context, req flockcni.Request) error { if _, ok := h.Store.Get(req.ContainerID); !ok { return cnitypes.NewError(cnitypes.ErrUnknownContainer, "flock-check", "container "+req.ContainerID+" has no allocation") } return nil } func resultFromAllocation(ifName string, a Allocation) *current.Result { r := ¤t.Result{CNIVersion: current.ImplementedSpecVersion} r.Interfaces = []*current.Interface{{Name: ifName, Sandbox: "pod"}} if a.IP6 != "" { ip6 := net.ParseIP(a.IP6) r.IPs = append(r.IPs, ¤t.IPConfig{ Interface: intPtr(0), Address: net.IPNet{IP: ip6, Mask: net.CIDRMask(128, 128)}, }) } if a.IP4 != "" { ip4 := net.ParseIP(a.IP4).To4() r.IPs = append(r.IPs, ¤t.IPConfig{ Interface: intPtr(0), Address: net.IPNet{IP: ip4, Mask: net.CIDRMask(32, 32)}, }) } // Addresses are assigned to eth0 and should appear in pod.status.podIPs // so Kubernetes and workloads that inspect pod metadata see them. for _, s := range a.Addresses { ip := net.ParseIP(s) if ip == nil { continue } if v4 := ip.To4(); v4 != nil { r.IPs = append(r.IPs, ¤t.IPConfig{ Interface: intPtr(0), Address: net.IPNet{IP: v4, Mask: net.CIDRMask(32, 32)}, }) } else { r.IPs = append(r.IPs, ¤t.IPConfig{ Interface: intPtr(0), Address: net.IPNet{IP: ip.To16(), Mask: net.CIDRMask(128, 128)}, }) } } return r } func intPtr(i int) *int { return &i } func ipString(ip net.IP) string { if ip == nil { return "" } return canonical(ip) } func anycastStrings(ips []net.IP) []string { if len(ips) == 0 { return nil } out := make([]string, len(ips)) for i, ip := range ips { out[i] = canonical(ip) } return out }