forked from Ivasoft/device-mapping-manager
207 lines
6.7 KiB
Go
207 lines
6.7 KiB
Go
//go:build linux
|
|
|
|
/*
|
|
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
package cgroup
|
|
|
|
import (
|
|
"bufio"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/cilium/ebpf"
|
|
"github.com/cilium/ebpf/asm"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// BpfProgramLicense is the license string placed in the ProgramSpec of the
// eBPF device filter programs created in this file.
const BpfProgramLicense = "Apache"
|
|
|
|
// GetDeviceCGroupMountPath returns the mount path (and its prefix) for the device cgroup controller associated with pid
|
|
func (c *cgroupv2) GetDeviceCGroupMountPath(procRootPath string, pid int) (string, string, error) {
|
|
// Open the pid's mountinfo file in /proc.
|
|
path := fmt.Sprintf(filepath.Join(procRootPath, "proc", "%v", "mountinfo"), pid)
|
|
file, err := os.Open(path)
|
|
if err != nil {
|
|
return "", "", err
|
|
}
|
|
defer file.Close()
|
|
|
|
// Create a scanner to loop through the file's contents.
|
|
scanner := bufio.NewScanner(file)
|
|
scanner.Split(bufio.ScanLines)
|
|
|
|
// Loop through the file looking for a subsystem of '' (i.e. unified) entry.
|
|
for scanner.Scan() {
|
|
// Split each entry by '[space]'
|
|
parts := strings.Split(scanner.Text(), " ")
|
|
if len(parts) < 5 {
|
|
return "", "", fmt.Errorf("malformed mountinfo entry: %v", scanner.Text())
|
|
}
|
|
// Look for an entry with cgroup2 as the mount type.
|
|
if parts[len(parts)-3] != "cgroup2" {
|
|
continue
|
|
}
|
|
// Make sure the mount prefix is not a relative path.
|
|
if strings.HasPrefix(parts[3], "/..") {
|
|
return "", "", fmt.Errorf("relative path in mount prefix: %v", parts[3])
|
|
}
|
|
// Return the 3rd element as the prefix of the mount point for
|
|
// the devices cgroup and the 4th element as the mount point of
|
|
// the devices cgroup itself.
|
|
return parts[3], parts[4], nil
|
|
}
|
|
|
|
return "", "", fmt.Errorf("no cgroup2 filesystem in mountinfo file")
|
|
}
|
|
|
|
// GetDeviceCGroupRootPath returns the root path for the device cgroup controller associated with pid
|
|
func (c *cgroupv2) GetDeviceCGroupRootPath(procRootPath string, prefix string, pid int) (string, error) {
|
|
// Open the pid's cgroup file in /proc.
|
|
path := fmt.Sprintf(filepath.Join(procRootPath, "proc", "%v", "cgroup"), pid)
|
|
file, err := os.Open(path)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer file.Close()
|
|
|
|
// Create a scanner to loop through the file's contents.
|
|
scanner := bufio.NewScanner(file)
|
|
scanner.Split(bufio.ScanLines)
|
|
|
|
// Loop through the file looking for either a '' (i.e. unified) entry.
|
|
for scanner.Scan() {
|
|
// Split each entry by ':'
|
|
parts := strings.SplitN(scanner.Text(), ":", 3)
|
|
if len(parts) != 3 {
|
|
return "", fmt.Errorf("malformed cgroup entry: %v", scanner.Text())
|
|
}
|
|
// Look for the (empty) subsystem in the 1st element.
|
|
if parts[1] != "" {
|
|
continue
|
|
}
|
|
// Return the cgroup root from the 2nd element
|
|
// (with the prefix possibly stripped off).
|
|
if prefix == "/" {
|
|
return parts[2], nil
|
|
}
|
|
return strings.TrimPrefix(parts[2], prefix), nil
|
|
}
|
|
|
|
return "", fmt.Errorf("no cgroupv2 entries in file")
|
|
}
|
|
|
|
// AddDeviceRules adds a set of device rules for the device cgroup at cgroupPath
|
|
func (c *cgroupv2) AddDeviceRules(cgroupPath string, rules []DeviceRule) error {
|
|
// Open the cgroup path.
|
|
dirFD, err := unix.Open(cgroupPath, unix.O_DIRECTORY|unix.O_RDONLY, 0600)
|
|
if err != nil {
|
|
return fmt.Errorf("unable to open the cgroup path: %v", err)
|
|
}
|
|
defer unix.Close(dirFD)
|
|
|
|
// Find any existing eBPF device filter programs attached to this cgroup.
|
|
oldProgs, err := FindAttachedCgroupDeviceFilters(dirFD)
|
|
if err != nil {
|
|
return fmt.Errorf("unable to find any existing device filters attached to the cgroup: %v", err)
|
|
}
|
|
|
|
// Generate a new set of eBPF programs by prepending instructions for the
|
|
// new devices to the instructions of each existing program.
|
|
// If no existing programs found, create a new program with just our device filter.
|
|
var newProgs []*ebpf.Program
|
|
if len(oldProgs) == 0 {
|
|
oldInsts := asm.Instructions{asm.Return()}
|
|
|
|
newProg, err := generateNewProgram(rules, oldInsts)
|
|
if err != nil {
|
|
return fmt.Errorf("unable to generate new device filter program with no existing programs: %v", err)
|
|
}
|
|
|
|
newProgs = append(newProgs, newProg)
|
|
}
|
|
for _, oldProg := range oldProgs {
|
|
oldInfo, err := oldProg.Info()
|
|
if err != nil {
|
|
return fmt.Errorf("unable to get Info() of the original device filters program: %v", err)
|
|
}
|
|
|
|
oldInsts, err := oldInfo.Instructions()
|
|
if err != nil {
|
|
return fmt.Errorf("unable to get the instructions of the original device filters program: %v", err)
|
|
}
|
|
|
|
newProg, err := generateNewProgram(rules, oldInsts)
|
|
if err != nil {
|
|
return fmt.Errorf("unable to generate new device filter program from existing programs: %v", err)
|
|
}
|
|
|
|
newProgs = append(newProgs, newProg)
|
|
}
|
|
|
|
// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error below.
|
|
// This limit is not inherited into the container.
|
|
memlockLimit := &unix.Rlimit{
|
|
Cur: unix.RLIM_INFINITY,
|
|
Max: unix.RLIM_INFINITY,
|
|
}
|
|
_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
|
|
|
|
// Replace the set of existing eBPF programs with the new ones.
|
|
// We don't have to worry about atomically replacing each program (i.e. by
|
|
// using BPF_F_REPLACE) because we know that the code here is always run
|
|
// strictly *before* a container begins executing.
|
|
for _, oldProg := range oldProgs {
|
|
err = DetachCgroupDeviceFilter(oldProg, dirFD)
|
|
if err != nil {
|
|
return fmt.Errorf("unable to detach original device filters program: %v", err)
|
|
}
|
|
}
|
|
for _, newProg := range newProgs {
|
|
err = AttachCgroupDeviceFilter(newProg, dirFD)
|
|
if err != nil {
|
|
return fmt.Errorf("unable to attach new device filters program: %v", err)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func generateNewProgram(rules []DeviceRule, oldInsts asm.Instructions) (*ebpf.Program, error) {
|
|
// Prepend instructions for the new devices to the original set of instructions.
|
|
newInsts, err := PrependDeviceFilter(rules, oldInsts)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("unable to prepend new device filters to the original device filters program: %v", err)
|
|
}
|
|
|
|
// Generate new eBPF program for the merged device filter instructions.
|
|
spec := &ebpf.ProgramSpec{
|
|
Type: ebpf.CGroupDevice,
|
|
Instructions: newInsts,
|
|
License: BpfProgramLicense,
|
|
}
|
|
newProg, err := ebpf.NewProgram(spec)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("unable to create new device filters program: %v", err)
|
|
}
|
|
|
|
return newProg, nil
|
|
}
|