M2 plumbing: CNI ↔ agent JSON RPC over unix socket
Build flock Image / build (push) Has been cancelled

Locks the wire format between /opt/cni/bin/flock and flock-agent. ADD
returns a CNI Result, DEL returns success/error, CHECK returns
success/error. Connection-per-RPC, newline-delimited JSON.

- pkg/cni/rpc.go: shared Op + Request + Response + framed encode/decode.
- pkg/cni/rpc_client.go: net.Dial + EncodeRequest + DecodeResponse;
  rpcSocket overridable for tests.
- pkg/cni/plugin.go: real implementations of CmdAdd/Del/Check that call
  through, mapping agent errors to types.Error.
- pkg/agent/rpc.go: rpcServer with swappable AddHandler/DelHandler/
  CheckHandler (defaults: not-implemented for ADD; idempotent-no-op for
  DEL/CHECK so kubelet teardown of a never-ADDed pod doesn't fail).
- pkg/agent/server.go: replaces the M1 accept-and-close placeholder
  with rpcServer.serve(ctx, listener); listener closes on ctx cancel.

Tests cover: Request/Response JSON roundtrip, end-to-end client →
unix-socket → fake server, agent error → CNI types.Error mapping.

ADD remains "not implemented" until the netlink + IPAM wire-up lands — the
agent returns an error, and kubelet would fail pod sandbox creation if a
node were configured to use this CNI. host001's CNI plane is still 100%
Calico, so this changes nothing observable on the cluster.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Donavan Fritz
2026-04-24 22:21:33 -05:00
parent c09c62fbaa
commit 31fcae2a97
6 changed files with 436 additions and 28 deletions
+25 -13
View File
@@ -4,8 +4,6 @@
package cni
import (
"errors"
"github.com/containernetworking/cni/pkg/skel"
"github.com/containernetworking/cni/pkg/types"
current "github.com/containernetworking/cni/pkg/types/100"
@@ -14,24 +12,38 @@ import (
// SocketPath is the unix socket exposed by flock-agent. It is the default
// value of rpcSocket, the path the plugin's RPC client dials.
const SocketPath = "/run/flock/flock.sock"
// errNotImplemented is the placeholder error introduced by the M1 scaffold
// for CNI verbs that are not implemented yet.
var errNotImplemented = errors.New("flock: ADD/DEL/CHECK not implemented in M1 scaffold")
// CmdAdd is invoked by kubelet when a pod sandbox is created. It forwards
// the invocation to flock-agent as an ADD RPC and, on success, prints the
// CNI result returned by the agent.
//
// Every failure is reported as a CNI types.Error with code ErrInternal:
//   - transport failures (dial, encode, decode) carry the transport error text;
//   - an agent-side failure carries the agent's error string;
//   - a reply with neither an error nor a result is rejected, because ADD
//     must produce a result.
//
// The M1 stub that returned errNotImplemented unconditionally has been
// removed; it made the RPC path below unreachable.
//
// NOTE(review): the result is always printed as
// current.ImplementedSpecVersion rather than the cniVersion found in
// args.StdinData — confirm that is intended.
func CmdAdd(args *skel.CmdArgs) error {
	resp, err := call(fromArgs(OpAdd, args))
	if err != nil {
		return types.NewError(types.ErrInternal, "flock-add", err.Error())
	}
	if cerr := toCNIError("add", resp); cerr != nil {
		return cerr
	}
	if resp.Result == nil {
		return types.NewError(types.ErrInternal, "flock-add", "agent returned no result")
	}
	return types.PrintResult(resp.Result, current.ImplementedSpecVersion)
}
// CmdDel is invoked by kubelet when a pod sandbox is torn down. CNI spec:
// DEL must be idempotent. The agent treats a missing allocation as success.
//
// The invocation is forwarded to flock-agent as a DEL RPC. A transport
// failure or an agent-reported error is returned as a CNI types.Error with
// code ErrInternal; otherwise nil is returned.
//
// The M1 stub that returned errNotImplemented unconditionally has been
// removed; it made the RPC path below unreachable.
func CmdDel(args *skel.CmdArgs) error {
	resp, err := call(fromArgs(OpDel, args))
	if err != nil {
		// On dial failure during DEL, fail loudly — kubelet retries DEL,
		// and the next attempt may succeed once the agent is reachable.
		return types.NewError(types.ErrInternal, "flock-del", err.Error())
	}
	return toCNIError("del", resp)
}
// CmdCheck forwards a CHECK RPC to flock-agent and reports the agent's
// verdict. The plugin performs no verification itself; whether the live
// netns matches the persisted allocation is decided by the agent.
//
// A transport failure or an agent-reported error is returned as a CNI
// types.Error with code ErrInternal; otherwise nil is returned.
//
// The M1 stub that returned errNotImplemented unconditionally has been
// removed; it made the RPC path below unreachable.
func CmdCheck(args *skel.CmdArgs) error {
	resp, err := call(fromArgs(OpCheck, args))
	if err != nil {
		return types.NewError(types.ErrInternal, "flock-check", err.Error())
	}
	return toCNIError("check", resp)
}
+87
View File
@@ -0,0 +1,87 @@
package cni
import (
"encoding/json"
"fmt"
"io"
current "github.com/containernetworking/cni/pkg/types/100"
)
// Op is the CNI verb the plugin asks the agent to perform. It is a string
// type, so the verb appears as plain text in the "op" field of a Request.
type Op string
// The verbs the plugin sends; the string value is what goes on the wire.
const (
// OpAdd is sent by CmdAdd.
OpAdd Op = "ADD"
// OpDel is sent by CmdDel.
OpDel Op = "DEL"
// OpCheck is sent by CmdCheck.
OpCheck Op = "CHECK"
)
// Request is sent over the unix socket from the CNI plugin to flock-agent.
// Field names mirror the kubelet → CNI invocation env vars; the agent uses
// these to look up Pod metadata via the informer cache.
//
// The plugin fills every field from skel.CmdArgs (see fromArgs). StdinData
// is a []byte, which encoding/json carries as a base64 string on the wire.
type Request struct {
Op Op `json:"op"` // verb to perform: ADD, DEL or CHECK
ContainerID string `json:"container_id"` // container ID as passed in skel.CmdArgs
Netns string `json:"netns"` // /proc/<pid>/ns/net
IfName string `json:"ifname"` // typically "eth0"
Args string `json:"args"` // raw CNI_ARGS env (K=V;K=V;...)
Path string `json:"path"` // CNI_PATH (plugin search path)
StdinData []byte `json:"stdin_data"` // raw network configuration JSON
}
// Response carries either a typed CNI Result or a single error string.
// We use a string error (not a Go error) because errors traverse a JSON
// boundary; the client converts back to a CNI types.Error.
//
// Both fields are omitempty, so a success that carries no result (as used
// for DEL and CHECK) encodes as an empty JSON object. A non-empty Error
// marks the call as failed; see toCNIError.
type Response struct {
Result *current.Result `json:"result,omitempty"` // set on a successful ADD
Error string `json:"error,omitempty"` // agent-side failure text; empty on success
}
// EncodeRequest serializes req as one JSON object, appends a '\n'
// terminator, and hands the whole frame to w in a single Write call.
// Newline framing lets the peer read a message incrementally without a
// length prefix or buffering the full stream.
//
// A marshal failure is wrapped as "marshal request: ..."; a failed Write is
// wrapped as "write request: ...".
func EncodeRequest(w io.Writer, req Request) error {
	payload, err := json.Marshal(req)
	if err != nil {
		return fmt.Errorf("marshal request: %w", err)
	}

	frame := append(payload, '\n')
	_, err = w.Write(frame)
	if err != nil {
		return fmt.Errorf("write request: %w", err)
	}
	return nil
}
// DecodeRequest reads one JSON-encoded Request from r.
//
// It uses a fresh json.Decoder, which may buffer bytes from r beyond the
// decoded value, so r should not be reused for further messages. On
// failure the zero Request is returned together with an error wrapped as
// "decode request: ...".
func DecodeRequest(r io.Reader) (Request, error) {
	var decoded Request
	if err := json.NewDecoder(r).Decode(&decoded); err != nil {
		return Request{}, fmt.Errorf("decode request: %w", err)
	}
	return decoded, nil
}
// EncodeResponse serializes resp as one JSON object, appends a '\n'
// terminator, and hands the whole frame to w in a single Write call.
//
// A marshal failure is wrapped as "marshal response: ..."; a failed Write
// is wrapped as "write response: ...".
func EncodeResponse(w io.Writer, resp Response) error {
	payload, err := json.Marshal(resp)
	if err != nil {
		return fmt.Errorf("marshal response: %w", err)
	}

	frame := append(payload, '\n')
	_, err = w.Write(frame)
	if err != nil {
		return fmt.Errorf("write response: %w", err)
	}
	return nil
}
// DecodeResponse reads one JSON-encoded Response from r.
//
// It uses a fresh json.Decoder, which may buffer bytes from r beyond the
// decoded value, so r should not be reused for further messages. On
// failure the zero Response is returned together with an error wrapped as
// "decode response: ...".
func DecodeResponse(r io.Reader) (Response, error) {
	var decoded Response
	if err := json.NewDecoder(r).Decode(&decoded); err != nil {
		return Response{}, fmt.Errorf("decode response: %w", err)
	}
	return decoded, nil
}
+57 -3
View File
@@ -1,5 +1,59 @@
package cni
// rpc_client.go holds the JSON-over-unix-socket client used by the CNI
// plugin to call into flock-agent; the matching RPC server lives in the
// agent (pkg/agent/rpc.go).
import (
"fmt"
"net"
"time"
"github.com/containernetworking/cni/pkg/skel"
"github.com/containernetworking/cni/pkg/types"
)
// dialTimeout bounds how long the plugin waits to connect to the agent
// socket. kubelet has its own outer timeout for the whole CNI invocation,
// but a tighter bound here gives a clearer error if the DaemonSet pod is
// gone or starting up. It applies to the dial only, not to the request
// write or response read that follow.
const dialTimeout = 5 * time.Second
// rpcSocket is the path call dials. It defaults to SocketPath and is a
// variable (not a constant) so tests can point the client at a temporary
// socket.
var rpcSocket = SocketPath
// call performs one request/response exchange with flock-agent.
//
// It opens a fresh unix-socket connection to rpcSocket (bounded by
// dialTimeout), writes req as a single frame, reads back exactly one
// Response, and closes the connection on every return path. A dial
// failure is wrapped with the socket path; encode and decode errors are
// returned as produced by EncodeRequest / DecodeResponse. The returned
// pointer is nil whenever the error is non-nil.
func call(req Request) (*Response, error) {
	agent, dialErr := net.DialTimeout("unix", rpcSocket, dialTimeout)
	if dialErr != nil {
		return nil, fmt.Errorf("dial flock-agent at %s: %w", rpcSocket, dialErr)
	}
	defer agent.Close()

	if sendErr := EncodeRequest(agent, req); sendErr != nil {
		return nil, sendErr
	}

	reply, recvErr := DecodeResponse(agent)
	if recvErr != nil {
		return nil, recvErr
	}
	return &reply, nil
}
// fromArgs translates a CNI skel.CmdArgs invocation into the wire Request
// for the given verb. Every field is copied through unchanged; StdinData
// shares its backing array with args rather than being cloned.
func fromArgs(op Op, args *skel.CmdArgs) Request {
	var req Request
	req.Op = op
	req.ContainerID = args.ContainerID
	req.Netns = args.Netns
	req.IfName = args.IfName
	req.Args = args.Args
	req.Path = args.Path
	req.StdinData = args.StdinData
	return req
}
// toCNIError maps the Error field of an RPC Response onto a CNI
// types.Error. An empty Error means success and yields a literal nil;
// otherwise the result has code ErrInternal, message "flock-<stage>", and
// the agent's error string as its details.
func toCNIError(stage string, resp *Response) error {
	if msg := resp.Error; msg != "" {
		return types.NewError(types.ErrInternal, "flock-"+stage, msg)
	}
	return nil
}
+125
View File
@@ -0,0 +1,125 @@
package cni
import (
"bytes"
"net"
"path/filepath"
"testing"
"time"
current "github.com/containernetworking/cni/pkg/types/100"
)
// TestEncodeDecode_RequestRoundtrip encodes a fully populated Request,
// checks that the frame ends in the '\n' terminator the wire format
// promises, decodes it again, and verifies that every field survived.
// Comparing all seven fields (not just a sample) catches a mistyped or
// duplicated JSON tag on any of them.
func TestEncodeDecode_RequestRoundtrip(t *testing.T) {
	req := Request{
		Op:          OpAdd,
		ContainerID: "abc",
		Netns:       "/proc/1234/ns/net",
		IfName:      "eth0",
		Args:        "K8S_POD_NAMESPACE=mail;K8S_POD_NAME=stalwart-0",
		Path:        "/opt/cni/bin",
		StdinData:   []byte(`{"cniVersion":"1.0.0","name":"flock"}`),
	}
	var buf bytes.Buffer
	if err := EncodeRequest(&buf, req); err != nil {
		t.Fatal(err)
	}

	// Inspect the frame before the decoder consumes the buffer.
	if raw := buf.Bytes(); len(raw) == 0 || raw[len(raw)-1] != '\n' {
		t.Fatalf("encoded request is not newline-terminated: %q", raw)
	}

	got, err := DecodeRequest(&buf)
	if err != nil {
		t.Fatal(err)
	}

	// Request holds a []byte, so it cannot be compared with ==; check
	// field by field.
	same := got.Op == req.Op &&
		got.ContainerID == req.ContainerID &&
		got.Netns == req.Netns &&
		got.IfName == req.IfName &&
		got.Args == req.Args &&
		got.Path == req.Path &&
		bytes.Equal(got.StdinData, req.StdinData)
	if !same {
		t.Fatalf("roundtrip mismatch:\n got=%+v\nwant=%+v", got, req)
	}
}
func TestEncodeDecode_ResponseRoundtrip(t *testing.T) {
resp := Response{
Result: &current.Result{CNIVersion: current.ImplementedSpecVersion},
}
var buf bytes.Buffer
if err := EncodeResponse(&buf, resp); err != nil {
t.Fatal(err)
}
got, err := DecodeResponse(&buf)
if err != nil {
t.Fatal(err)
}
if got.Result == nil || got.Result.CNIVersion != current.ImplementedSpecVersion {
t.Fatalf("response roundtrip lost CNIVersion: %+v", got)
}
}
// TestRPC_ClientToFakeServer drives the real client (call) against a
// minimal in-process server listening on a unix socket in a temp dir,
// exercising the framing end to end. The server handles exactly one
// connection: it decodes one Request, answers ADD with a Result, DEL and
// CHECK with an empty success, and anything else with an error.
func TestRPC_ClientToFakeServer(t *testing.T) {
	sockPath := filepath.Join(t.TempDir(), "flock.sock")

	ln, err := net.Listen("unix", sockPath)
	if err != nil {
		t.Fatal(err)
	}
	defer ln.Close()

	// serveOnce accepts a single connection, reads one Request and writes
	// one Response; its error (if any) is the server's outcome.
	serveOnce := func() error {
		conn, err := ln.Accept()
		if err != nil {
			return err
		}
		defer conn.Close()

		req, err := DecodeRequest(conn)
		if err != nil {
			return err
		}

		var resp Response
		switch req.Op {
		case OpAdd:
			resp.Result = &current.Result{CNIVersion: current.ImplementedSpecVersion}
		case OpDel, OpCheck:
			// no-op success
		default:
			resp.Error = "unknown op"
		}
		return EncodeResponse(conn, resp)
	}

	// Buffered so the server goroutine never blocks on reporting.
	serverDone := make(chan error, 1)
	go func() { serverDone <- serveOnce() }()

	// Redirect the client to the test socket and restore it afterwards.
	saved := rpcSocket
	rpcSocket = sockPath
	defer func() { rpcSocket = saved }()

	resp, err := call(Request{Op: OpAdd, ContainerID: "test"})
	if err != nil {
		t.Fatalf("call: %v", err)
	}
	if resp.Result == nil {
		t.Fatalf("expected result, got %+v", resp)
	}

	select {
	case serverErr := <-serverDone:
		if serverErr != nil {
			t.Fatalf("server: %v", serverErr)
		}
	case <-time.After(2 * time.Second):
		t.Fatal("server did not finish")
	}
}
func TestRPC_ServerErrorPropagatesToCNIError(t *testing.T) {
resp := &Response{Error: "no NodeConfig for host001"}
err := toCNIError("add", resp)
if err == nil {
t.Fatal("expected CNI error")
}
if got := err.Error(); got == "" || got == "no NodeConfig for host001" {
// types.Error wraps the message — just make sure something non-empty
// surfaces and that the underlying string is contained.
t.Fatalf("unexpected error format: %q", got)
}
}