Files
device-mapping-manager/internal/cgroup/v2.go
Nadeem Douba 752fc9b8ba Added support for v1 and v2 cgroups
No longer using a plugin to deploy solution
2023-07-03 09:58:18 -04:00

207 lines
6.7 KiB
Go

//go:build linux
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cgroup
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strings"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"golang.org/x/sys/unix"
)
const (
// BpfProgramLicense is the license string set in the ProgramSpec of the
// eBPF device filter programs created by generateNewProgram.
BpfProgramLicense = "Apache"
)
// GetDeviceCGroupMountPath returns the mount prefix and the mount point of the
// cgroup2 (unified) filesystem as listed in pid's mountinfo file.
//
// It reads <procRootPath>/proc/<pid>/mountinfo and uses the first entry whose
// filesystem type is "cgroup2". The first return value is the 4th
// space-separated field of that entry (the mount prefix) and the second is the
// 5th field (the mount point).
//
// An error is returned if the file cannot be opened or read, if an entry
// scanned before a match has fewer than five fields, if the mount prefix
// starts with "/..", or if no cgroup2 entry is present.
func (c *cgroupv2) GetDeviceCGroupMountPath(procRootPath string, pid int) (string, string, error) {
	// Build the path from plain components. procRootPath must never be part
	// of a format string: a '%' in it would be parsed as a formatting verb.
	path := filepath.Join(procRootPath, "proc", fmt.Sprint(pid), "mountinfo")
	file, err := os.Open(path)
	if err != nil {
		return "", "", err
	}
	defer file.Close()

	// Walk the file line by line looking for the cgroup2 mount.
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := scanner.Text()

		// Split each entry by '[space]'.
		parts := strings.Split(line, " ")
		if len(parts) < 5 {
			return "", "", fmt.Errorf("malformed mountinfo entry: %v", line)
		}

		// The filesystem type is the third field from the end of the entry.
		if parts[len(parts)-3] != "cgroup2" {
			continue
		}

		// Make sure the mount prefix is not a relative path.
		if strings.HasPrefix(parts[3], "/..") {
			return "", "", fmt.Errorf("relative path in mount prefix: %v", parts[3])
		}

		// parts[3] is the prefix of the mount point and parts[4] is the
		// mount point itself.
		return parts[3], parts[4], nil
	}

	// Distinguish "file could not be read" from "no cgroup2 entry found".
	if err := scanner.Err(); err != nil {
		return "", "", fmt.Errorf("reading %v: %w", path, err)
	}
	return "", "", fmt.Errorf("no cgroup2 filesystem in mountinfo file")
}
// GetDeviceCGroupRootPath returns the cgroup path recorded for the unified
// (cgroup v2) hierarchy in pid's cgroup file.
//
// It reads <procRootPath>/proc/<pid>/cgroup, whose entries have the form
// "<id>:<subsystems>:<path>", and uses the first entry whose subsystem field
// is empty. If prefix is "/" the path is returned unchanged; otherwise prefix
// is trimmed from the front of the path when present.
//
// An error is returned if the file cannot be opened or read, if an entry
// scanned before a match does not have three ':'-separated fields, or if no
// entry with an empty subsystem field is present.
func (c *cgroupv2) GetDeviceCGroupRootPath(procRootPath string, prefix string, pid int) (string, error) {
	// Build the path from plain components. procRootPath must never be part
	// of a format string: a '%' in it would be parsed as a formatting verb.
	path := filepath.Join(procRootPath, "proc", fmt.Sprint(pid), "cgroup")
	file, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer file.Close()

	// Walk the file line by line looking for the '' (i.e. unified) entry.
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := scanner.Text()

		// Split into at most three fields so that a ':' inside the cgroup
		// path itself is preserved.
		parts := strings.SplitN(line, ":", 3)
		if len(parts) != 3 {
			return "", fmt.Errorf("malformed cgroup entry: %v", line)
		}

		// Skip entries that name a subsystem.
		if parts[1] != "" {
			continue
		}

		// A prefix of "/" is special-cased so that the leading '/' of the
		// cgroup path is kept rather than trimmed.
		if prefix == "/" {
			return parts[2], nil
		}
		return strings.TrimPrefix(parts[2], prefix), nil
	}

	// Distinguish "file could not be read" from "no unified entry found".
	if err := scanner.Err(); err != nil {
		return "", fmt.Errorf("reading %v: %w", path, err)
	}
	return "", fmt.Errorf("no cgroupv2 entries in file")
}
// AddDeviceRules adds a set of device rules for the device cgroup at cgroupPath.
//
// The cgroup directory is opened and the eBPF device filter programs currently
// attached to it are looked up. For every such program a replacement is
// generated by prepending the instructions for rules to the existing
// program's instructions; if nothing is attached, a single program is
// generated from rules followed by a bare return instruction. All existing
// programs are then detached and the replacements are attached.
//
// The first failing step aborts the call and its error is returned, wrapped so
// that errors.Is / errors.As can still reach the underlying cause. No rollback
// is attempted: if attaching fails after the existing programs were detached,
// they are not re-attached.
func (c *cgroupv2) AddDeviceRules(cgroupPath string, rules []DeviceRule) error {
	// Open the cgroup path.
	dirFD, err := unix.Open(cgroupPath, unix.O_DIRECTORY|unix.O_RDONLY, 0600)
	if err != nil {
		return fmt.Errorf("unable to open the cgroup path: %w", err)
	}
	defer unix.Close(dirFD)

	// Find any existing eBPF device filter programs attached to this cgroup.
	oldProgs, err := FindAttachedCgroupDeviceFilters(dirFD)
	if err != nil {
		return fmt.Errorf("unable to find any existing device filters attached to the cgroup: %w", err)
	}

	// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD errors. This has to
	// happen before generateNewProgram is called below, because that is
	// where the programs are created (via ebpf.NewProgram).
	// This limit is not inherited into the container.
	// The result is deliberately ignored (best effort).
	memlockLimit := &unix.Rlimit{
		Cur: unix.RLIM_INFINITY,
		Max: unix.RLIM_INFINITY,
	}
	_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)

	// Generate a new set of eBPF programs by prepending instructions for the
	// new devices to the instructions of each existing program.
	// If no existing programs found, create a new program with just our device filter.
	var newProgs []*ebpf.Program
	if len(oldProgs) == 0 {
		oldInsts := asm.Instructions{asm.Return()}
		newProg, err := generateNewProgram(rules, oldInsts)
		if err != nil {
			return fmt.Errorf("unable to generate new device filter program with no existing programs: %w", err)
		}
		newProgs = append(newProgs, newProg)
	}
	for _, oldProg := range oldProgs {
		oldInfo, err := oldProg.Info()
		if err != nil {
			return fmt.Errorf("unable to get Info() of the original device filters program: %w", err)
		}
		oldInsts, err := oldInfo.Instructions()
		if err != nil {
			return fmt.Errorf("unable to get the instructions of the original device filters program: %w", err)
		}
		newProg, err := generateNewProgram(rules, oldInsts)
		if err != nil {
			return fmt.Errorf("unable to generate new device filter program from existing programs: %w", err)
		}
		newProgs = append(newProgs, newProg)
	}

	// Replace the set of existing eBPF programs with the new ones.
	// We don't have to worry about atomically replacing each program (i.e. by
	// using BPF_F_REPLACE) because we know that the code here is always run
	// strictly *before* a container begins executing.
	for _, oldProg := range oldProgs {
		if err := DetachCgroupDeviceFilter(oldProg, dirFD); err != nil {
			return fmt.Errorf("unable to detach original device filters program: %w", err)
		}
	}
	for _, newProg := range newProgs {
		if err := AttachCgroupDeviceFilter(newProg, dirFD); err != nil {
			return fmt.Errorf("unable to attach new device filters program: %w", err)
		}
	}
	return nil
}
// generateNewProgram creates a new CGroupDevice eBPF program whose
// instructions are the device filter for rules prepended to oldInsts.
//
// The program is created with BpfProgramLicense as its license. Errors from
// PrependDeviceFilter and ebpf.NewProgram are returned wrapped, so callers can
// still inspect the underlying cause with errors.Is / errors.As.
func generateNewProgram(rules []DeviceRule, oldInsts asm.Instructions) (*ebpf.Program, error) {
	// Prepend instructions for the new devices to the original set of instructions.
	newInsts, err := PrependDeviceFilter(rules, oldInsts)
	if err != nil {
		return nil, fmt.Errorf("unable to prepend new device filters to the original device filters program: %w", err)
	}

	// Generate new eBPF program for the merged device filter instructions.
	spec := &ebpf.ProgramSpec{
		Type:         ebpf.CGroupDevice,
		Instructions: newInsts,
		License:      BpfProgramLicense,
	}
	newProg, err := ebpf.NewProgram(spec)
	if err != nil {
		return nil, fmt.Errorf("unable to create new device filters program: %w", err)
	}
	return newProg, nil
}