2026-04-24 22:33:48 -05:00
|
|
|
package agent
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"context"
|
|
|
|
|
"fmt"
|
2026-04-25 11:09:09 -05:00
|
|
|
"log/slog"
|
2026-04-24 22:33:48 -05:00
|
|
|
"net"
|
2026-04-25 11:42:06 -05:00
|
|
|
"strings"
|
2026-04-24 22:33:48 -05:00
|
|
|
"time"
|
|
|
|
|
|
|
|
|
|
flockcni "code.fritzlab.net/fritzlab/flock/pkg/cni"
|
|
|
|
|
cnitypes "github.com/containernetworking/cni/pkg/types"
|
|
|
|
|
current "github.com/containernetworking/cni/pkg/types/100"
|
2026-04-25 11:42:06 -05:00
|
|
|
corev1 "k8s.io/api/core/v1"
|
2026-04-24 22:33:48 -05:00
|
|
|
)
|
|
|
|
|
|
2026-04-25 11:42:06 -05:00
|
|
|
// podTemplateHashLabel is the well-known label Kubernetes attaches to
// every Pod owned by a ReplicaSet so the ReplicaSet name can be
// reconstructed as "<deploy>-<hash>". deriveAppName reads this label to
// peel the "-<hash>" suffix back off the ReplicaSet name.
const podTemplateHashLabel = "pod-template-hash"
|
|
|
|
|
|
|
|
|
|
// deriveAppName returns the stable workload identifier for a Pod — the
|
|
|
|
|
// name of the topmost stable controller, with the pod-template-hash
|
|
|
|
|
// stripped for ReplicaSet-owned pods.
|
|
|
|
|
//
|
|
|
|
|
// The rule maps to Kubernetes pod-name generation:
|
|
|
|
|
//
|
|
|
|
|
// Deployment → ReplicaSet → Pod pod owner is RS named "<deploy>-<hash>";
|
|
|
|
|
// strip the trailing "-<hash>" to recover
|
|
|
|
|
// the Deployment name.
|
|
|
|
|
// StatefulSet → Pod pod owner is the STS itself; use as-is.
|
|
|
|
|
// DaemonSet → Pod pod owner is the DS itself; use as-is.
|
|
|
|
|
// Job → Pod pod owner is the Job itself; use as-is.
|
|
|
|
|
// (bare pod) → Pod no controller owner; fall back to pod name.
|
|
|
|
|
//
|
|
|
|
|
// All replicas of the same workload converge on the same return value,
|
|
|
|
|
// which is the property the ip-algo `app` field needs.
|
|
|
|
|
func deriveAppName(pod *corev1.Pod) string {
|
|
|
|
|
owner := controllerOwner(pod)
|
|
|
|
|
if owner == nil {
|
|
|
|
|
return pod.Name
|
|
|
|
|
}
|
|
|
|
|
if owner.Kind == "ReplicaSet" {
|
|
|
|
|
if hash, ok := pod.Labels[podTemplateHashLabel]; ok && hash != "" {
|
|
|
|
|
suffix := "-" + hash
|
|
|
|
|
if strings.HasSuffix(owner.Name, suffix) {
|
|
|
|
|
return strings.TrimSuffix(owner.Name, suffix)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// Custom controller named the RS something that doesn't match
|
|
|
|
|
// the pod-template-hash convention. Falling back to the RS name
|
|
|
|
|
// keeps replicas of the same RS aligned, which is the second-
|
|
|
|
|
// best correctness we can offer.
|
|
|
|
|
return owner.Name
|
|
|
|
|
}
|
|
|
|
|
return owner.Name
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// controllerOwner returns the OwnerReference flagged with Controller=true,
|
|
|
|
|
// or nil if none. Kubernetes guarantees at most one controller per object.
|
|
|
|
|
func controllerOwner(pod *corev1.Pod) *metav1OwnerLite {
|
|
|
|
|
for i := range pod.OwnerReferences {
|
|
|
|
|
o := &pod.OwnerReferences[i]
|
|
|
|
|
if o.Controller != nil && *o.Controller {
|
|
|
|
|
return &metav1OwnerLite{Kind: o.Kind, Name: o.Name}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// metav1OwnerLite holds the only two OwnerReference fields this file
// consults. controllerOwner fills one in from the controlling reference and
// returns a pointer to it; keeping the struct this small makes that cheap.
type metav1OwnerLite struct {
	// Kind is the owner's kind, copied from OwnerReference.Kind
	// (deriveAppName compares it against "ReplicaSet").
	Kind string
	// Name is the owner's object name, copied from OwnerReference.Name.
	Name string
}
|
|
|
|
|
|
|
|
|
|
// podImageRef returns a deterministic image reference for the embed
|
|
|
|
|
// `image` field. We use the first container's spec'd image — this is
|
|
|
|
|
// stable across replicas of the same Deployment without requiring the
|
|
|
|
|
// runtime-resolved digest. Empty string if the pod has no containers,
|
|
|
|
|
// in which case the embed package falls back to FNV(containerID).
|
|
|
|
|
func podImageRef(pod *corev1.Pod) string {
|
|
|
|
|
if len(pod.Spec.Containers) == 0 {
|
|
|
|
|
return ""
|
|
|
|
|
}
|
|
|
|
|
return pod.Spec.Containers[0].Image
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-24 22:33:48 -05:00
|
|
|
// PodHandler is the platform-agnostic ADD/DEL/CHECK implementation. It
// resolves the Pod from the informer cache, parses annotations, allocates
// from IPAM, programs netns (or skips on non-Linux build), and persists
// state. The netns ops are split into Setup/Teardown so platform stubs can
// keep the rest of the orchestration testable.
type PodHandler struct {
	// Node is not read by Add, Del or Check in this file.
	// NOTE(review): presumably the local node name — confirm against callers.
	Node string
	// Store persists Allocation records keyed by container ID; the handler
	// uses its Get, Upsert and Delete methods.
	Store *Store
	// IPAM hands out addresses (Allocate) and takes them back (Release).
	IPAM *IPAM
	// Pods is used by Add to look up the Pod via WaitForPod.
	Pods *PodCache
	// NodeConfig supplies the node configuration via Load; Add tolerates a
	// nil result when reading node annotations.
	NodeConfig *NodeConfigCache
	// Logger is passed to ResolveIPAlgo during Add.
	Logger *slog.Logger

	// SetupFunc and TeardownFunc are injected at startup; in production
	// they point at the Linux netlink ops, in tests they're fakes. Both
	// are called unconditionally (Add and Del respectively), so they must
	// be non-nil.
	SetupFunc    func(SetupRequest) error
	TeardownFunc func(containerID string, ip6, ip4 net.IP) error

	// AfterCommit, when non-nil, is invoked with no arguments after an ADD
	// has stored its committed entry and after a DEL has removed its entry
	// and released the addresses — used to refresh BIRD config. It is not
	// invoked on Add's idempotent fast path or when Del finds no entry.
	AfterCommit func()
}
|
|
|
|
|
|
|
|
|
|
// Add implements the CNI ADD path.
//
// Sequence: parse CNI_ARGS, short-circuit if this container ID is already
// committed, look up the Pod, parse its annotations, allocate addresses,
// persist a pending entry, run SetupFunc, persist the committed entry, and
// finally invoke AfterCommit (if set).
//
// It returns the CNI result built from the committed allocation, or a
// wrapped error naming the step that failed.
func (h *PodHandler) Add(ctx context.Context, req flockcni.Request) (*current.Result, error) {
	args := ParseCNIArgs(req.Args)
	if args.PodName == "" || args.PodNamespace == "" {
		return nil, fmt.Errorf("CNI_ARGS missing K8S_POD_NAMESPACE/NAME")
	}

	// Idempotency: if we already committed this containerID, return the
	// existing IPs. kubelet retries ADD on the same sandbox. An entry that
	// is still pending does not match here and goes through the full path
	// again.
	if existing, ok := h.Store.Get(req.ContainerID); ok && existing.State == StateCommitted {
		return resultFromAllocation(req.IfName, existing), nil
	}

	// Give the pod cache up to 3 seconds to produce the Pod.
	pod, err := h.Pods.WaitForPod(ctx, args.PodNamespace, args.PodName, 3*time.Second)
	if err != nil {
		return nil, fmt.Errorf("lookup pod: %w", err)
	}

	// NOTE(review): nc is handed to FamilyDefaultsFromNodeConfig before the
	// nil check below; presumably that helper tolerates nil — confirm.
	nc := h.NodeConfig.Load()
	defaults := FamilyDefaultsFromNodeConfig(nc)
	parsed, err := ParseAnnotations(pod.Annotations, defaults)
	if err != nil {
		return nil, fmt.Errorf("parse annotations: %w", err)
	}

	// Node annotations are optional: with no node config, ResolveIPAlgo
	// receives a nil map.
	var nodeAnn map[string]string
	if nc != nil {
		nodeAnn = nc.GetAnnotations()
	}
	ipAlgo := ResolveIPAlgo(pod.Annotations, nodeAnn, h.Logger)

	allocReq := AllocRequest{
		ContainerID: req.ContainerID,
		Namespace:   args.PodNamespace,
		Pod:         args.PodName,
		App:         deriveAppName(pod),
		WantV6:      parsed.WantV6,
		WantV4:      parsed.WantV4,
		AnnCIDR6:    parsed.CIDR6,
		AnnCIDR4:    parsed.CIDR4,
		IPAlgo:      ipAlgo,
		Image:       podImageRef(pod),
	}
	res, err := h.IPAM.Allocate(allocReq)
	if err != nil {
		return nil, fmt.Errorf("ipam: %w", err)
	}

	// Persist pending entry before any netlink work so a crash mid-ADD
	// leaves recoverable state.
	pending := Allocation{
		ContainerID: req.ContainerID,
		Namespace:   args.PodNamespace,
		PodName:     args.PodName,
		OwnerUID:    string(pod.UID),
		IP6:         ipString(res.IP6),
		IP4:         ipString(res.IP4),
		Anycast:     anycastStrings(parsed.Anycast),
		Addresses:   anycastStrings(parsed.Addresses),
		State:       StatePending,
		AllocatedAt: time.Now().UTC(),
	}
	if err := h.Store.Upsert(pending); err != nil {
		// Nothing was recorded, so hand the addresses straight back.
		h.IPAM.Release(res.IP6, res.IP4)
		return nil, fmt.Errorf("store pending: %w", err)
	}

	setup := SetupRequest{
		ContainerID: req.ContainerID,
		Netns:       req.Netns,
		IfName:      req.IfName,
		HostIface:   HostIfaceName(req.ContainerID),
		IP6:         res.IP6,
		IP4:         res.IP4,
		Anycast:     parsed.Anycast,
		Addresses:   parsed.Addresses,
	}
	if err := h.SetupFunc(setup); err != nil {
		// Roll forward: leave pending entry in place so startup GC can clean
		// up the partial netns; let kubelet retry ADD. The addresses are
		// deliberately not released here.
		return nil, fmt.Errorf("netns setup: %w", err)
	}

	// Same record as the pending one, with only the state flipped.
	committed := pending
	committed.State = StateCommitted
	if err := h.Store.Upsert(committed); err != nil {
		// The netns is already programmed at this point; the stored entry
		// remains pending.
		return nil, fmt.Errorf("store commit: %w", err)
	}

	if h.AfterCommit != nil {
		h.AfterCommit()
	}

	return resultFromAllocation(req.IfName, committed), nil
}
|
|
|
|
|
|
|
|
|
|
// Del implements CNI DEL. Idempotent.
|
|
|
|
|
func (h *PodHandler) Del(ctx context.Context, req flockcni.Request) error {
|
|
|
|
|
entry, ok := h.Store.Get(req.ContainerID)
|
|
|
|
|
if !ok {
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
ip6 := net.ParseIP(entry.IP6)
|
|
|
|
|
ip4 := net.ParseIP(entry.IP4)
|
|
|
|
|
|
|
|
|
|
if err := h.TeardownFunc(req.ContainerID, ip6, ip4); err != nil {
|
|
|
|
|
return fmt.Errorf("netns teardown: %w", err)
|
|
|
|
|
}
|
|
|
|
|
if err := h.Store.Delete(req.ContainerID); err != nil {
|
|
|
|
|
return fmt.Errorf("store delete: %w", err)
|
|
|
|
|
}
|
|
|
|
|
h.IPAM.Release(ip6, ip4)
|
|
|
|
|
if h.AfterCommit != nil {
|
|
|
|
|
h.AfterCommit()
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Check verifies that the persisted state is consistent. M2 minimum: just
|
|
|
|
|
// look up the entry; full kernel-state comparison is M7.
|
|
|
|
|
func (h *PodHandler) Check(_ context.Context, req flockcni.Request) error {
|
|
|
|
|
if _, ok := h.Store.Get(req.ContainerID); !ok {
|
|
|
|
|
return cnitypes.NewError(cnitypes.ErrUnknownContainer, "flock-check",
|
|
|
|
|
"container "+req.ContainerID+" has no allocation")
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func resultFromAllocation(ifName string, a Allocation) *current.Result {
|
|
|
|
|
r := ¤t.Result{CNIVersion: current.ImplementedSpecVersion}
|
|
|
|
|
r.Interfaces = []*current.Interface{{Name: ifName, Sandbox: "pod"}}
|
|
|
|
|
if a.IP6 != "" {
|
|
|
|
|
ip6 := net.ParseIP(a.IP6)
|
|
|
|
|
r.IPs = append(r.IPs, ¤t.IPConfig{
|
|
|
|
|
Interface: intPtr(0),
|
|
|
|
|
Address: net.IPNet{IP: ip6, Mask: net.CIDRMask(128, 128)},
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
if a.IP4 != "" {
|
|
|
|
|
ip4 := net.ParseIP(a.IP4).To4()
|
|
|
|
|
r.IPs = append(r.IPs, ¤t.IPConfig{
|
|
|
|
|
Interface: intPtr(0),
|
|
|
|
|
Address: net.IPNet{IP: ip4, Mask: net.CIDRMask(32, 32)},
|
|
|
|
|
})
|
|
|
|
|
}
|
2026-04-28 18:11:17 -05:00
|
|
|
// Addresses are assigned to eth0 and should appear in pod.status.podIPs
|
|
|
|
|
// so Kubernetes and workloads that inspect pod metadata see them.
|
|
|
|
|
for _, s := range a.Addresses {
|
|
|
|
|
ip := net.ParseIP(s)
|
|
|
|
|
if ip == nil {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if v4 := ip.To4(); v4 != nil {
|
|
|
|
|
r.IPs = append(r.IPs, ¤t.IPConfig{
|
|
|
|
|
Interface: intPtr(0),
|
|
|
|
|
Address: net.IPNet{IP: v4, Mask: net.CIDRMask(32, 32)},
|
|
|
|
|
})
|
|
|
|
|
} else {
|
|
|
|
|
r.IPs = append(r.IPs, ¤t.IPConfig{
|
|
|
|
|
Interface: intPtr(0),
|
|
|
|
|
Address: net.IPNet{IP: ip.To16(), Mask: net.CIDRMask(128, 128)},
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
}
|
2026-04-24 22:33:48 -05:00
|
|
|
return r
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// intPtr returns a pointer to a fresh copy of i, for populating optional
// *int fields such as IPConfig.Interface.
func intPtr(i int) *int {
	v := i
	return &v
}
|
|
|
|
|
func ipString(ip net.IP) string {
|
|
|
|
|
if ip == nil {
|
|
|
|
|
return ""
|
|
|
|
|
}
|
|
|
|
|
return canonical(ip)
|
|
|
|
|
}
|
2026-04-25 07:36:47 -05:00
|
|
|
|
|
|
|
|
func anycastStrings(ips []net.IP) []string {
|
|
|
|
|
if len(ips) == 0 {
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
out := make([]string, len(ips))
|
|
|
|
|
for i, ip := range ips {
|
|
|
|
|
out[i] = canonical(ip)
|
|
|
|
|
}
|
|
|
|
|
return out
|
|
|
|
|
}
|