// Forked from Ivasoft/device-mapping-manager (300 lines, 9.2 KiB, Go).
//go:build linux
|
|
|
|
/*
|
|
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
// The implementation of the device filter eBPF program in this file is based on:
|
|
// https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
|
|
//
|
|
// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano)
|
|
// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397
|
|
//
|
|
// Much of the go code in this file is borrowed heavily from the (Apache licensed) file found here:
|
|
// https://github.com/opencontainers/runc/blob/8e7ab26104352f4214ab1daec5c1d4bf75eddb54/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
|
|
|
|
package cgroup
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"math"
|
|
"os"
|
|
"runtime"
|
|
"unsafe"
|
|
|
|
"github.com/cilium/ebpf"
|
|
"github.com/cilium/ebpf/asm"
|
|
"github.com/cilium/ebpf/link"
|
|
"github.com/google/uuid"
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
"github.com/sirupsen/logrus"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// nilCloser is a no-op close function: it does nothing and always reports
// success.
func nilCloser() error { return nil }
|
|
|
|
type program struct {
|
|
insts asm.Instructions
|
|
hasWildCard bool
|
|
blockID int
|
|
}
|
|
|
|
func (p *program) init() {
|
|
// struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423
|
|
/*
|
|
u32 access_type
|
|
u32 major
|
|
u32 minor
|
|
*/
|
|
// R2 <- type (lower 16 bit of u32 access_type at R1[0])
|
|
p.insts = append(p.insts,
|
|
asm.LoadMem(asm.R2, asm.R1, 0, asm.Half))
|
|
|
|
// R3 <- access (upper 16 bit of u32 access_type at R1[0])
|
|
p.insts = append(p.insts,
|
|
asm.LoadMem(asm.R3, asm.R1, 0, asm.Word),
|
|
// RSh: bitwise shift right
|
|
asm.RSh.Imm32(asm.R3, 16))
|
|
|
|
// R4 <- major (u32 major at R1[4])
|
|
p.insts = append(p.insts,
|
|
asm.LoadMem(asm.R4, asm.R1, 4, asm.Word))
|
|
|
|
// R5 <- minor (u32 minor at R1[8])
|
|
p.insts = append(p.insts,
|
|
asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
|
|
}
|
|
|
|
// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element.
|
|
func (p *program) appendDevice(dev specs.LinuxDeviceCgroup, labelPrefix string) error {
|
|
if p.blockID < 0 {
|
|
return errors.New("the program is finalized")
|
|
}
|
|
if p.hasWildCard {
|
|
// All entries after wildcard entry are ignored
|
|
return nil
|
|
}
|
|
|
|
bpfType := int32(-1)
|
|
hasType := true
|
|
switch dev.Type {
|
|
case string('c'):
|
|
bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
|
|
case string('b'):
|
|
bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
|
|
case string('a'):
|
|
hasType = false
|
|
default:
|
|
// if not specified in OCI json, typ is set to DeviceTypeAll
|
|
return fmt.Errorf("invalid DeviceType %q", dev.Type)
|
|
}
|
|
if *dev.Major > math.MaxUint32 {
|
|
return fmt.Errorf("invalid major %d", *dev.Major)
|
|
}
|
|
if *dev.Minor > math.MaxUint32 {
|
|
return fmt.Errorf("invalid minor %d", *dev.Major)
|
|
}
|
|
hasMajor := *dev.Major >= 0 // if not specified in OCI json, major is set to -1
|
|
hasMinor := *dev.Minor >= 0
|
|
bpfAccess := int32(0)
|
|
for _, r := range dev.Access {
|
|
switch r {
|
|
case 'r':
|
|
bpfAccess |= unix.BPF_DEVCG_ACC_READ
|
|
case 'w':
|
|
bpfAccess |= unix.BPF_DEVCG_ACC_WRITE
|
|
case 'm':
|
|
bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
|
|
default:
|
|
return fmt.Errorf("unknown device access %v", r)
|
|
}
|
|
}
|
|
// If the access is rwm, skip the check.
|
|
hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
|
|
|
|
blockSym := fmt.Sprintf("%s-block-%d", labelPrefix, p.blockID)
|
|
nextBlockSym := fmt.Sprintf("%s-block-%d", labelPrefix, p.blockID+1)
|
|
prevBlockLastIdx := len(p.insts) - 1
|
|
if hasType {
|
|
p.insts = append(p.insts,
|
|
// if (R2 != bpfType) goto next
|
|
asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
|
|
)
|
|
}
|
|
if hasAccess {
|
|
p.insts = append(p.insts,
|
|
// if (R3 & bpfAccess == 0 /* use R2 as a temp var */) goto next
|
|
asm.Mov.Reg32(asm.R2, asm.R3),
|
|
asm.And.Imm32(asm.R2, bpfAccess),
|
|
asm.JEq.Imm(asm.R2, 0, nextBlockSym),
|
|
)
|
|
}
|
|
if hasMajor {
|
|
p.insts = append(p.insts,
|
|
// if (R4 != major) goto next
|
|
asm.JNE.Imm(asm.R4, int32(*dev.Major), nextBlockSym),
|
|
)
|
|
}
|
|
if hasMinor {
|
|
p.insts = append(p.insts,
|
|
// if (R5 != minor) goto next
|
|
asm.JNE.Imm(asm.R5, int32(*dev.Minor), nextBlockSym),
|
|
)
|
|
}
|
|
if !hasType && !hasAccess && !hasMajor && !hasMinor {
|
|
p.hasWildCard = true
|
|
}
|
|
p.insts = append(p.insts, p.acceptBlock(dev.Allow)...)
|
|
// set blockSym to the first instruction we added in this iteration
|
|
p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
|
|
p.blockID++
|
|
return nil
|
|
}
|
|
|
|
func (p *program) acceptBlock(accept bool) asm.Instructions {
|
|
v := int32(0)
|
|
if accept {
|
|
v = 1
|
|
}
|
|
return []asm.Instruction{
|
|
// R0 <- v
|
|
asm.Mov.Imm32(asm.R0, v),
|
|
asm.Return(),
|
|
}
|
|
}
|
|
|
|
func (p *program) finalize(origInsts asm.Instructions, labelPrefix string) (asm.Instructions, error) {
|
|
lenInsts := len(p.insts)
|
|
// set blockSym to the first instruction of origInsts so we are able to jump to it properly
|
|
blockSym := fmt.Sprintf("%s-block-%d", labelPrefix, p.blockID)
|
|
p.insts = append(p.insts, origInsts...)
|
|
p.insts[lenInsts] = p.insts[lenInsts].Sym(blockSym)
|
|
p.blockID = -1
|
|
return p.insts, nil
|
|
}
|
|
|
|
// FindAttachedCgroupDeviceFilters finds all ebpf prgrams associated with 'dirFd' that control device access
|
|
func FindAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) {
|
|
type bpfAttrQuery struct {
|
|
TargetFd uint32
|
|
AttachType uint32
|
|
QueryType uint32
|
|
AttachFlags uint32
|
|
ProgIds uint64 // __aligned_u64
|
|
ProgCnt uint32
|
|
}
|
|
|
|
// Currently you can only have 64 eBPF programs attached to a cgroup.
|
|
size := 64
|
|
retries := 0
|
|
for retries < 10 {
|
|
progIds := make([]uint32, size)
|
|
query := bpfAttrQuery{
|
|
TargetFd: uint32(dirFd),
|
|
AttachType: uint32(unix.BPF_CGROUP_DEVICE),
|
|
ProgIds: uint64(uintptr(unsafe.Pointer(&progIds[0]))),
|
|
ProgCnt: uint32(len(progIds)),
|
|
}
|
|
|
|
// Fetch the list of program ids.
|
|
_, _, errno := unix.Syscall(unix.SYS_BPF,
|
|
uintptr(unix.BPF_PROG_QUERY),
|
|
uintptr(unsafe.Pointer(&query)),
|
|
unsafe.Sizeof(query))
|
|
size = int(query.ProgCnt)
|
|
runtime.KeepAlive(query)
|
|
if errno != 0 {
|
|
// On ENOSPC we get the correct number of programs.
|
|
if errno == unix.ENOSPC {
|
|
retries++
|
|
continue
|
|
}
|
|
return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
|
|
}
|
|
|
|
// Convert the ids to program handles.
|
|
progIds = progIds[:size]
|
|
programs := make([]*ebpf.Program, 0, len(progIds))
|
|
for _, progId := range progIds {
|
|
program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
|
|
if err != nil {
|
|
// We skip over programs that give us -EACCES or -EPERM. This
|
|
// is necessary because there may be BPF programs that have
|
|
// been attached (such as with --systemd-cgroup) which have an
|
|
// LSM label that blocks us from interacting with the program.
|
|
//
|
|
// Because additional BPF_CGROUP_DEVICE programs only can add
|
|
// restrictions, there's no real issue with just ignoring these
|
|
// programs (and stops runc from breaking on distributions with
|
|
// very strict SELinux policies).
|
|
if errors.Is(err, os.ErrPermission) {
|
|
logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err)
|
|
continue
|
|
}
|
|
return nil, fmt.Errorf("cannot fetch program from id: %w", err)
|
|
}
|
|
programs = append(programs, program)
|
|
}
|
|
runtime.KeepAlive(progIds)
|
|
return programs, nil
|
|
}
|
|
|
|
return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
|
|
}
|
|
|
|
// PrependDeviceFilter prepends a set of instructions for further device filtering to an existing device filtering ebpf program
|
|
func PrependDeviceFilter(devices []specs.LinuxDeviceCgroup, origInsts asm.Instructions) (asm.Instructions, error) {
|
|
labelPrefix := uuid.New().String()
|
|
p := &program{}
|
|
p.init()
|
|
for i := len(devices) - 1; i >= 0; i-- {
|
|
if err := p.appendDevice(devices[i], labelPrefix); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
insts, err := p.finalize(origInsts, labelPrefix)
|
|
return insts, err
|
|
}
|
|
|
|
// DetachCgroupDeviceFilter detaches an existing device filter ebpf program from a cgroup.
|
|
func DetachCgroupDeviceFilter(prog *ebpf.Program, dirFd int) error {
|
|
err := link.RawDetachProgram(link.RawDetachProgramOptions{
|
|
Target: dirFd,
|
|
Program: prog,
|
|
Attach: ebpf.AttachCGroupDevice,
|
|
})
|
|
if err != nil {
|
|
return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// AttachCgroupDeviceFilter attaches a new device filter ebpf program to a cgroup.
|
|
func AttachCgroupDeviceFilter(prog *ebpf.Program, dirFd int) error {
|
|
err := link.RawAttachProgram(link.RawAttachProgramOptions{
|
|
Target: dirFd,
|
|
Program: prog,
|
|
Attach: ebpf.AttachCGroupDevice,
|
|
Flags: unix.BPF_F_ALLOW_MULTI,
|
|
})
|
|
if err != nil {
|
|
return fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
|
|
}
|
|
return nil
|
|
}
|