Files
device-mapping-manager/internal/cgroup/ebpf.go
Nadeem Douba 752fc9b8ba Added support for v1 and v2 cgroups
No longer using a plugin to deploy solution
2023-07-03 09:58:18 -04:00

300 lines
9.2 KiB
Go

//go:build linux
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// The implementation of the device filter eBPF program in this file is based on:
// https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
//
// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano)
// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397
//
// Much of the go code in this file is borrowed heavily from the (Apache licensed) file found here:
// https://github.com/opencontainers/runc/blob/8e7ab26104352f4214ab1daec5c1d4bf75eddb54/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
package cgroup
import (
"errors"
"fmt"
"math"
"os"
"runtime"
"unsafe"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/link"
"github.com/google/uuid"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func nilCloser() error {
return nil
}
type program struct {
insts asm.Instructions
hasWildCard bool
blockID int
}
func (p *program) init() {
// struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423
/*
u32 access_type
u32 major
u32 minor
*/
// R2 <- type (lower 16 bit of u32 access_type at R1[0])
p.insts = append(p.insts,
asm.LoadMem(asm.R2, asm.R1, 0, asm.Half))
// R3 <- access (upper 16 bit of u32 access_type at R1[0])
p.insts = append(p.insts,
asm.LoadMem(asm.R3, asm.R1, 0, asm.Word),
// RSh: bitwise shift right
asm.RSh.Imm32(asm.R3, 16))
// R4 <- major (u32 major at R1[4])
p.insts = append(p.insts,
asm.LoadMem(asm.R4, asm.R1, 4, asm.Word))
// R5 <- minor (u32 minor at R1[8])
p.insts = append(p.insts,
asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
}
// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element.
func (p *program) appendDevice(dev specs.LinuxDeviceCgroup, labelPrefix string) error {
if p.blockID < 0 {
return errors.New("the program is finalized")
}
if p.hasWildCard {
// All entries after wildcard entry are ignored
return nil
}
bpfType := int32(-1)
hasType := true
switch dev.Type {
case string('c'):
bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
case string('b'):
bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
case string('a'):
hasType = false
default:
// if not specified in OCI json, typ is set to DeviceTypeAll
return fmt.Errorf("invalid DeviceType %q", dev.Type)
}
if *dev.Major > math.MaxUint32 {
return fmt.Errorf("invalid major %d", *dev.Major)
}
if *dev.Minor > math.MaxUint32 {
return fmt.Errorf("invalid minor %d", *dev.Major)
}
hasMajor := *dev.Major >= 0 // if not specified in OCI json, major is set to -1
hasMinor := *dev.Minor >= 0
bpfAccess := int32(0)
for _, r := range dev.Access {
switch r {
case 'r':
bpfAccess |= unix.BPF_DEVCG_ACC_READ
case 'w':
bpfAccess |= unix.BPF_DEVCG_ACC_WRITE
case 'm':
bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
default:
return fmt.Errorf("unknown device access %v", r)
}
}
// If the access is rwm, skip the check.
hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
blockSym := fmt.Sprintf("%s-block-%d", labelPrefix, p.blockID)
nextBlockSym := fmt.Sprintf("%s-block-%d", labelPrefix, p.blockID+1)
prevBlockLastIdx := len(p.insts) - 1
if hasType {
p.insts = append(p.insts,
// if (R2 != bpfType) goto next
asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
)
}
if hasAccess {
p.insts = append(p.insts,
// if (R3 & bpfAccess == 0 /* use R2 as a temp var */) goto next
asm.Mov.Reg32(asm.R2, asm.R3),
asm.And.Imm32(asm.R2, bpfAccess),
asm.JEq.Imm(asm.R2, 0, nextBlockSym),
)
}
if hasMajor {
p.insts = append(p.insts,
// if (R4 != major) goto next
asm.JNE.Imm(asm.R4, int32(*dev.Major), nextBlockSym),
)
}
if hasMinor {
p.insts = append(p.insts,
// if (R5 != minor) goto next
asm.JNE.Imm(asm.R5, int32(*dev.Minor), nextBlockSym),
)
}
if !hasType && !hasAccess && !hasMajor && !hasMinor {
p.hasWildCard = true
}
p.insts = append(p.insts, p.acceptBlock(dev.Allow)...)
// set blockSym to the first instruction we added in this iteration
p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
p.blockID++
return nil
}
func (p *program) acceptBlock(accept bool) asm.Instructions {
v := int32(0)
if accept {
v = 1
}
return []asm.Instruction{
// R0 <- v
asm.Mov.Imm32(asm.R0, v),
asm.Return(),
}
}
func (p *program) finalize(origInsts asm.Instructions, labelPrefix string) (asm.Instructions, error) {
lenInsts := len(p.insts)
// set blockSym to the first instruction of origInsts so we are able to jump to it properly
blockSym := fmt.Sprintf("%s-block-%d", labelPrefix, p.blockID)
p.insts = append(p.insts, origInsts...)
p.insts[lenInsts] = p.insts[lenInsts].Sym(blockSym)
p.blockID = -1
return p.insts, nil
}
// FindAttachedCgroupDeviceFilters finds all ebpf prgrams associated with 'dirFd' that control device access
func FindAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) {
type bpfAttrQuery struct {
TargetFd uint32
AttachType uint32
QueryType uint32
AttachFlags uint32
ProgIds uint64 // __aligned_u64
ProgCnt uint32
}
// Currently you can only have 64 eBPF programs attached to a cgroup.
size := 64
retries := 0
for retries < 10 {
progIds := make([]uint32, size)
query := bpfAttrQuery{
TargetFd: uint32(dirFd),
AttachType: uint32(unix.BPF_CGROUP_DEVICE),
ProgIds: uint64(uintptr(unsafe.Pointer(&progIds[0]))),
ProgCnt: uint32(len(progIds)),
}
// Fetch the list of program ids.
_, _, errno := unix.Syscall(unix.SYS_BPF,
uintptr(unix.BPF_PROG_QUERY),
uintptr(unsafe.Pointer(&query)),
unsafe.Sizeof(query))
size = int(query.ProgCnt)
runtime.KeepAlive(query)
if errno != 0 {
// On ENOSPC we get the correct number of programs.
if errno == unix.ENOSPC {
retries++
continue
}
return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
}
// Convert the ids to program handles.
progIds = progIds[:size]
programs := make([]*ebpf.Program, 0, len(progIds))
for _, progId := range progIds {
program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
if err != nil {
// We skip over programs that give us -EACCES or -EPERM. This
// is necessary because there may be BPF programs that have
// been attached (such as with --systemd-cgroup) which have an
// LSM label that blocks us from interacting with the program.
//
// Because additional BPF_CGROUP_DEVICE programs only can add
// restrictions, there's no real issue with just ignoring these
// programs (and stops runc from breaking on distributions with
// very strict SELinux policies).
if errors.Is(err, os.ErrPermission) {
logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err)
continue
}
return nil, fmt.Errorf("cannot fetch program from id: %w", err)
}
programs = append(programs, program)
}
runtime.KeepAlive(progIds)
return programs, nil
}
return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
}
// PrependDeviceFilter prepends a set of instructions for further device filtering to an existing device filtering ebpf program
func PrependDeviceFilter(devices []specs.LinuxDeviceCgroup, origInsts asm.Instructions) (asm.Instructions, error) {
labelPrefix := uuid.New().String()
p := &program{}
p.init()
for i := len(devices) - 1; i >= 0; i-- {
if err := p.appendDevice(devices[i], labelPrefix); err != nil {
return nil, err
}
}
insts, err := p.finalize(origInsts, labelPrefix)
return insts, err
}
// DetachCgroupDeviceFilter detaches an existing device filter ebpf program from a cgroup.
func DetachCgroupDeviceFilter(prog *ebpf.Program, dirFd int) error {
err := link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err)
}
return nil
}
// AttachCgroupDeviceFilter attaches a new device filter ebpf program to a cgroup.
func AttachCgroupDeviceFilter(prog *ebpf.Program, dirFd int) error {
err := link.RawAttachProgram(link.RawAttachProgramOptions{
Target: dirFd,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
Flags: unix.BPF_F_ALLOW_MULTI,
})
if err != nil {
return fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
}
return nil
}