src/internal/runtime/cgroup/cgroup_linux.go - go - Git at Google

 // Copyright 2025 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package cgroup

 import (
 	"internal/bytealg"
 	"internal/runtime/strconv"
 	"internal/runtime/syscall/linux"
 )

 var (
 	// ErrNoCgroup is returned by the Find*/Open* functions in this file
 	// when no CPU cgroup could be found for the current process.
 	ErrNoCgroup error = stringError("not in a cgroup")

 	// errMalformedFile is returned when a file read by this package does
 	// not have the format its parser expects.
 	errMalformedFile error = stringError("malformed file")
 )

 // _PATH_MAX is the maximum path length this package sizes its buffers for
 // (presumably mirroring Linux's PATH_MAX; confirm against the kernel headers).
 const _PATH_MAX = 4096

 const (
 	// Required amount of scratch space for OpenCPU.
 	//
 	// TODO(prattmic): This is shockingly large (~70KiB) due to the (very
 	// unlikely) combination of extremely long paths consisting mostly of
 	// escaped characters. The scratch buffer ends up in .bss in package
 	// runtime, so it doesn't contribute to binary size and generally won't
 	// be faulted in, but it would still be nice to shrink this. A more
 	// complex parser that did not need to keep entire lines in memory
 	// could get away with much less. Alternatively, we could do a one-off
 	// mmap allocation for this buffer, which is only mapped larger if we
 	// actually need the extra space.
 	ScratchSize = PathSize + ParseSize

 	// Required space to store a path of the cgroup in the filesystem.
 	PathSize = _PATH_MAX

 	// /proc/self/mountinfo path escape sequences are 4 characters long, so
 	// a path consisting entirely of escaped characters could be 4 times
 	// larger.
 	escapedPathMax = 4 * _PATH_MAX

 	// Required space to parse /proc/self/mountinfo and /proc/self/cgroup.
 	// See parseCPUMount and parseCPURelativePath.
 	ParseSize = 4 * escapedPathMax
 )

 // Names of the cgroup control files that OpenCPU appends to the cgroup
 // directory path before opening them.
 //
 // Include explicit NUL to be sure we include it in the slice: the path is
 // passed to linux.Open as a pointer to its first byte, so it must be
 // NUL-terminated.
 const (
 	v2MaxFile    = "/cpu.max\x00"
 	v1QuotaFile  = "/cpu.cfs_quota_us\x00"
 	v1PeriodFile = "/cpu.cfs_period_us\x00"
 )

 // Version indicates the cgroup version.
 type Version int

 const (
 	// VersionUnknown is the zero value; it is not a valid version for a
 	// CPU (Close and ReadCPULimit throw on it).
 	VersionUnknown Version = iota
 	// V1 is cgroup v1.
 	V1
 	// V2 is cgroup v2.
 	V2
 )

 // CPU owns the FDs required to read the CPU limit from a cgroup.
 //
 // A CPU is created by OpenCPU, read with ReadCPULimit, and released with
 // Close.
 type CPU struct {
 	// version selects which of the FDs below are in use.
 	version Version

 	// For cgroup v1, this is cpu.cfs_quota_us.
 	// For cgroup v2, this is cpu.max.
 	quotaFD int

 	// For cgroup v1, this is cpu.cfs_period_us.
 	// For cgroup v2, this is unused (OpenCPU sets it to -1).
 	periodFD int
 }

 // Close releases the file descriptors owned by c: the quota FD for both
 // cgroup versions, plus the period FD for cgroup v1.
 //
 // It throws if c does not carry a valid cgroup version.
 func (c CPU) Close() {
 	if c.version != V1 && c.version != V2 {
 		throw("impossible cgroup version")
 	}

 	// Both versions own quotaFD; only v1 additionally owns periodFD.
 	linux.Close(c.quotaFD)
 	if c.version == V1 {
 		linux.Close(c.periodFD)
 	}
 }

 // checkBufferSize throws unless s has exactly size bytes. The actual and
 // wanted lengths are printed before throwing.
 func checkBufferSize(s []byte, size int) {
 	if len(s) == size {
 		return
 	}
 	println("runtime: cgroup buffer length", len(s), "want", size)
 	throw("runtime: cgroup invalid buffer length")
 }

 // OpenCPU returns a CPU for the CPU cgroup containing the current process, or
 // ErrNoCgroup if the process is not in a CPU cgroup.
 //
 // scratch must have length ScratchSize. Its first PathSize bytes are used to
 // build the control file paths; the remainder is parsing scratch space.
 //
 // On error no file descriptors are left open. The caller owns the returned
 // CPU and must Close it.
 func OpenCPU(scratch []byte) (CPU, error) {
 	checkBufferSize(scratch, ScratchSize)

 	base := scratch[:PathSize]
 	scratch2 := scratch[PathSize:]

 	n, version, err := FindCPU(base, scratch2)
 	if err != nil {
 		return CPU{}, err
 	}

 	switch version {
 	case V1:
 		// copy truncates silently. If the file name (including its
 		// trailing NUL) does not fit after the cgroup path, the path
 		// would not be NUL-terminated, so refuse to open it.
 		if copy(base[n:], v1QuotaFile) != len(v1QuotaFile) {
 			return CPU{}, errSyscallFailed
 		}
 		quotaFD, errno := linux.Open(&base[0], linux.O_RDONLY|linux.O_CLOEXEC, 0)
 		if errno != 0 {
 			// This may fail if this process was migrated out of
 			// the cgroup found by FindCPU and that cgroup has been
 			// deleted.
 			return CPU{}, errSyscallFailed
 		}

 		if copy(base[n:], v1PeriodFile) != len(v1PeriodFile) {
 			// Don't leak the FD we already opened.
 			linux.Close(quotaFD)
 			return CPU{}, errSyscallFailed
 		}
 		periodFD, errno := linux.Open(&base[0], linux.O_RDONLY|linux.O_CLOEXEC, 0)
 		if errno != 0 {
 			// This may fail if this process was migrated out of
 			// the cgroup found by FindCPU and that cgroup has been
 			// deleted.
 			//
 			// Don't leak the FD we already opened.
 			linux.Close(quotaFD)
 			return CPU{}, errSyscallFailed
 		}

 		c := CPU{
 			version:  V1,
 			quotaFD:  quotaFD,
 			periodFD: periodFD,
 		}
 		return c, nil
 	case V2:
 		// See the truncation note in the V1 case.
 		if copy(base[n:], v2MaxFile) != len(v2MaxFile) {
 			return CPU{}, errSyscallFailed
 		}
 		maxFD, errno := linux.Open(&base[0], linux.O_RDONLY|linux.O_CLOEXEC, 0)
 		if errno != 0 {
 			// This may fail if this process was migrated out of
 			// the cgroup found by FindCPU and that cgroup has been
 			// deleted.
 			return CPU{}, errSyscallFailed
 		}

 		c := CPU{
 			version:  V2,
 			quotaFD:  maxFD,
 			periodFD: -1,
 		}
 		return c, nil
 	default:
 		throw("impossible cgroup version")
 		panic("unreachable")
 	}
 }

 // ReadCPULimit returns the average CPU throughput limit from the cgroup, or
 // ok false if there is no limit.
 //
 // For cgroup v1 the limit is quota/period, read from the two files owned by
 // c; a negative quota means no limit. For cgroup v2 the limit is parsed from
 // cpu.max.
 //
 // It throws if c does not carry a valid cgroup version.
 func ReadCPULimit(c CPU) (float64, bool, error) {
 	switch c.version {
 	case V1:
 		quota, err := readV1Number(c.quotaFD)
 		if err != nil {
 			// Pass the error through so that a failed read is not
 			// misreported as a malformed file.
 			return 0, false, err
 		}

 		if quota < 0 {
 			// No limit.
 			return 0, false, nil
 		}

 		period, err := readV1Number(c.periodFD)
 		if err != nil {
 			return 0, false, err
 		}
 		if period <= 0 {
 			// A non-positive period is meaningless and would turn
 			// the division below into +Inf or NaN.
 			return 0, false, errMalformedFile
 		}

 		return float64(quota) / float64(period), true, nil
 	case V2:
 		// quotaFD is the cpu.max FD.
 		return readV2Limit(c.quotaFD)
 	default:
 		throw("impossible cgroup version")
 		panic("unreachable")
 	}
 }

 // readV1Number reads fd from offset 0 and returns the number stored in a
 // cgroup v1 quota/period file.
 //
 // It returns errSyscallFailed if the read fails and errMalformedFile if the
 // contents fill the whole buffer or do not parse.
 func readV1Number(fd int) (int64, error) {
 	// The file holds "<value>\n", where value is an int64 number of
 	// microseconds (the quota may be -1, meaning no limit).
 	//
 	// MaxInt64 is 19 decimal digits, so 19 + 1 (newline) = 20 bytes is a
 	// conservative upper bound on the file size. The buffer is a bit
 	// larger for good measure.
 	var data [64]byte

 	// Read at offset 0 every time so that we always see a fresh value.
 	n, errno := linux.Pread(fd, data[:], 0)
 	switch {
 	case errno != 0:
 		return 0, errSyscallFailed
 	case n == len(data):
 		// A completely full buffer means the file is larger than
 		// anything we expect.
 		return 0, errMalformedFile
 	}

 	return parseV1Number(data[:n])
 }

 // parseV1Number parses the "<value>\n" contents of a cgroup v1 quota/period
 // file. Everything from the first newline on is ignored; a missing newline
 // or an unparsable value yields errMalformedFile.
 func parseV1Number(buf []byte) (int64, error) {
 	nl := bytealg.IndexByte(buf, '\n')
 	if nl < 0 {
 		return 0, errMalformedFile
 	}

 	if val, ok := strconv.Atoi64(string(buf[:nl])); ok {
 		return val, nil
 	}
 	return 0, errMalformedFile
 }

 // readV2Limit reads a cgroup v2 cpu.max file from offset 0 and returns the
 // CPU throughput limit, or ok false if there is no limit.
 //
 // It returns errSyscallFailed if the read fails and errMalformedFile if the
 // contents fill the whole buffer or do not parse.
 func readV2Limit(fd int) (float64, bool, error) {
 	// The file holds "<quota> <period>\n". Both values are in
 	// microseconds, and quota may be the literal "max" for no limit.
 	//
 	// Note that the kernel is inconsistent about whether the values are
 	// uint64 or int64: values are parsed as uint64 but printed as int64.
 	// See kernel/sched/core.c:cpu_max_{show,write}.
 	//
 	// In practice, the kernel limits the period to 1s (1000000us) (see
 	// max_cfs_quota_period), and the quota to (1<<44)us (see
 	// max_cfs_runtime), so these values can't get large enough for the
 	// distinction to matter.
 	//
 	// MaxInt64 is 19 decimal digits, so 19 + 19 + 1 (space) + 1 (newline)
 	// = 40 bytes is a conservative upper bound on the file size. The
 	// buffer is a bit larger for good measure.
 	var data [64]byte

 	// Read at offset 0 every time so that we always see a fresh value.
 	n, errno := linux.Pread(fd, data[:], 0)
 	switch {
 	case errno != 0:
 		return 0, false, errSyscallFailed
 	case n == len(data):
 		// A completely full buffer means the file is larger than
 		// anything we expect.
 		return 0, false, errMalformedFile
 	}

 	return parseV2Limit(data[:n])
 }

 // parseV2Limit parses the "<quota> <period>\n" contents of a cgroup v2
 // cpu.max file and returns quota/period.
 //
 // It returns ok false with a nil error if quota is "max" (no limit). It
 // returns errMalformedFile if the separating space or the terminating
 // newline is missing, if either number does not parse, or if the period is
 // not positive.
 func parseV2Limit(buf []byte) (float64, bool, error) {
 	i := bytealg.IndexByte(buf, ' ')
 	if i < 0 {
 		return 0, false, errMalformedFile
 	}

 	quotaStr := buf[:i]
 	// Comparing via string conversion does not allocate.
 	if string(quotaStr) == "max" {
 		// No limit.
 		return 0, false, nil
 	}

 	periodStr := buf[i+1:]
 	// The period must be terminated by a newline; drop it and anything
 	// after it.
 	i = bytealg.IndexByte(periodStr, '\n')
 	if i < 0 {
 		return 0, false, errMalformedFile
 	}
 	periodStr = periodStr[:i]

 	quota, ok := strconv.Atoi64(string(quotaStr))
 	if !ok {
 		return 0, false, errMalformedFile
 	}

 	period, ok := strconv.Atoi64(string(periodStr))
 	if !ok {
 		return 0, false, errMalformedFile
 	}
 	if period <= 0 {
 		// A non-positive period is meaningless and would turn the
 		// division below into +Inf or NaN.
 		return 0, false, errMalformedFile
 	}

 	return float64(quota) / float64(period), true, nil
 }

 // FindCPU writes to out the path of the CPU cgroup that this process is a
 // member of. scratch is a scratch buffer for internal use.
 //
 // out must have length PathSize. scratch must have length ParseSize.
 //
 // Returns the number of bytes written to out and the cgroup version (1 or 2).
 //
 // Returns ErrNoCgroup if the process is not in a CPU cgroup.
 func FindCPU(out []byte, scratch []byte) (int, Version, error) {
 	checkBufferSize(out, PathSize)
 	checkBufferSize(scratch, ParseSize)

 	// The result is <cgroup mount point> + <relative path>, looked up in
 	// that order.
 	//
 	// The two lookups are not atomic, so this is racy if our cgroup is
 	// changed while this runs. For example, initially there is only a
 	// cgroup v2 mount and we are not in a cgroup. After, there is a cgroup
 	// v1 mount with a CPU controller and we are placed in a cgroup in this
 	// hierarchy. In that case the mount point lookup could pick the v2
 	// mount while the relative path lookup finds the v1 path.
 	//
 	// In this case we'll later fail to read the cgroup files and fall back
 	// to assuming no cgroup.
 	mountLen, err := FindCPUMountPoint(out, scratch)
 	if err != nil {
 		return 0, 0, err
 	}

 	// The relative path always starts with /, so it can be written
 	// directly after the mount point.
 	relLen, version, err := FindCPURelativePath(out[mountLen:], scratch)
 	if err != nil {
 		return 0, 0, err
 	}

 	return mountLen + relLen, version, nil
 }

 // FindCPURelativePath reads /proc/self/cgroup and writes to out the path of
 // the CPU cgroup that this process is a member of, relative to the root of
 // the cgroup mount. scratch is a scratch buffer for internal use.
 //
 // out must have length PathSize minus the size of the cgroup mount root (if
 // known). scratch must have length ParseSize.
 //
 // Returns the number of bytes written to out and the cgroup version (1 or 2).
 //
 // Returns ErrNoCgroup if the process is not in a CPU cgroup, including when
 // /proc/self/cgroup does not exist.
 func FindCPURelativePath(out []byte, scratch []byte) (int, Version, error) {
 	name := []byte("/proc/self/cgroup\x00")
 	fd, errno := linux.Open(&name[0], linux.O_RDONLY|linux.O_CLOEXEC, 0)
 	switch {
 	case errno == linux.ENOENT:
 		return 0, 0, ErrNoCgroup
 	case errno != 0:
 		return 0, 0, errSyscallFailed
 	}

 	n, version, err := parseCPURelativePath(fd, linux.Read, out, scratch)
 	// The FD is no longer needed whether or not parsing succeeded.
 	linux.Close(fd)
 	if err != nil {
 		return 0, 0, err
 	}
 	return n, version, nil
 }

 // parseCPURelativePath reads /proc/self/cgroup-formatted data from fd using
 // read and writes to out the path of the current process's CPU cgroup
 // relative to the cgroup mount. scratch backs the line reader.
 //
 // Returns the number of bytes written and the cgroup version (1 or 2), or
 // ErrNoCgroup if no suitable line was found.
 func parseCPURelativePath(fd int, read func(fd int, b []byte) (int, uintptr), out []byte, scratch []byte) (int, Version, error) {
 	// Every line looks like
 	//
 	//   hierarchy-ID:controller-list:cgroup-path
 	//
 	// where controller-list is comma-separated. See man 5 cgroup for more
 	// details.
 	//
 	// cgroup v2 has hierarchy-ID 0. If a v1 hierarchy contains "cpu", that
 	// is the CPU controller. Otherwise the v2 hierarchy (if any) is the
 	// CPU controller.
 	//
 	// hierarchy-ID and controller-list have relatively small maximum
 	// sizes, and the path can be up to _PATH_MAX, so we need a bit more
 	// than 1 _PATH_MAX of scratch space.

 	lr := newLineReader(fd, scratch, read)

 	// Length of the v2 path copied to out so far, if any.
 	v2Len := 0

 	for {
 		err := lr.next()
 		if err == errEOF {
 			break
 		}
 		if err != nil {
 			// This includes errIncompleteLine: incomplete lines are
 			// not allowed. While in theory the incomplete line may
 			// be for a controller we don't care about, in practice
 			// all lines should be of similar length, so we should
 			// just have a buffer big enough for any.
 			return 0, 0, err
 		}

 		rest := lr.line()

 		// Split off hierarchy-ID.
 		sep := bytealg.IndexByte(rest, ':')
 		if sep < 0 {
 			return 0, 0, errMalformedFile
 		}
 		hierarchy := rest[:sep]
 		rest = rest[sep+1:]

 		// Split off controller-list; what remains is cgroup-path.
 		sep = bytealg.IndexByte(rest, ':')
 		if sep < 0 {
 			return 0, 0, errMalformedFile
 		}
 		controllers := rest[:sep]
 		cgroupPath := rest[sep+1:]

 		if string(hierarchy) != "0" {
 			// v1 hierarchy.
 			if containsCPU(controllers) {
 				// Found a v1 CPU controller. This must be the
 				// only one, so we're done.
 				return copy(out, cgroupPath), V1, nil
 			}
 			continue
 		}

 		// v2 hierarchy. Remember it, but keep searching: a v1
 		// hierarchy with a CPU controller takes precedence.
 		v2Len = copy(out, cgroupPath)
 	}

 	if v2Len == 0 {
 		// Found nothing.
 		return 0, 0, ErrNoCgroup
 	}

 	// Must be v2, v1 returns from inside the loop.
 	return v2Len, V2, nil
 }

 // containsCPU reports whether the comma-separated list b has an element that
 // is exactly "cpu".
 func containsCPU(b []byte) bool {
 	for len(b) > 0 {
 		// Peel the next element off the front of the list.
 		item := b
 		if comma := bytealg.IndexByte(b, ','); comma >= 0 {
 			item, b = b[:comma], b[comma+1:]
 		} else {
 			// Last element.
 			b = nil
 		}

 		// Neither cmd/compile nor gccgo allocates for this string conversion.
 		if string(item) == "cpu" {
 			return true
 		}
 	}

 	return false
 }

 // FindCPUMountPoint reads /proc/self/mountinfo, finds the root of the CPU
 // cgroup mount, and places it in out. scratch is a scratch buffer for
 // internal use.
 //
 // out must have length PathSize. scratch must have length ParseSize.
 //
 // Returns the number of bytes written to out.
 //
 // Returns ErrNoCgroup if the process is not in a CPU cgroup, including when
 // /proc/self/mountinfo does not exist.
 func FindCPUMountPoint(out []byte, scratch []byte) (int, error) {
 	checkBufferSize(out, PathSize)
 	checkBufferSize(scratch, ParseSize)

 	name := []byte("/proc/self/mountinfo\x00")
 	fd, errno := linux.Open(&name[0], linux.O_RDONLY|linux.O_CLOEXEC, 0)
 	switch {
 	case errno == linux.ENOENT:
 		return 0, ErrNoCgroup
 	case errno != 0:
 		return 0, errSyscallFailed
 	}

 	n, err := parseCPUMount(fd, linux.Read, out, scratch)
 	// The FD is no longer needed whether or not parsing succeeded.
 	linux.Close(fd)
 	if err != nil {
 		return 0, err
 	}
 	return n, nil
 }

 // parseCPUMount returns the mount point for the cpu cgroup controller (v1 or
 // v2) from /proc/self/mountinfo.
 //
 // The file contents are read from fd using read, with scratch backing the
 // line reader. The unescaped mount point is written to out and its length is
 // returned. Returns ErrNoCgroup if no suitable mount is found.
 func parseCPUMount(fd int, read func(fd int, b []byte) (int, uintptr), out []byte, scratch []byte) (int, error) {
 	// The format of each line is:
 	//
 	// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
 	// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
 	//
 	// (1) mount ID:  unique identifier of the mount (may be reused after umount)
 	// (2) parent ID:  ID of parent (or of self for the top of the mount tree)
 	// (3) major:minor:  value of st_dev for files on filesystem
 	// (4) root:  root of the mount within the filesystem
 	// (5) mount point:  mount point relative to the process's root
 	// (6) mount options:  per mount options
 	// (7) optional fields:  zero or more fields of the form "tag[:value]"
 	// (8) separator:  marks the end of the optional fields
 	// (9) filesystem type:  name of filesystem of the form "type[.subtype]"
 	// (10) mount source:  filesystem specific information or "none"
 	// (11) super options:  per super block options
 	//
 	// See man 5 proc_pid_mountinfo for more details.
 	//
 	// Note that emitted paths will not contain space, tab, newline, or
 	// carriage return. Those are escaped. See Linux show_mountinfo ->
 	// show_path. We must unescape before returning.
 	//
 	// We return the mount point (5) if the filesystem type (9) is cgroup2,
 	// or cgroup with "cpu" in the super options (11).
 	//
 	// (4), (5), and (10) are up to _PATH_MAX. The remaining fields have a
 	// small fixed maximum size, so 4*_PATH_MAX is plenty of scratch space.
 	// Note that non-cgroup mounts may have arbitrarily long (11), but we
 	// can skip those when parsing.

 	l := newLineReader(fd, scratch, read)

 	// Bytes written to out.
 	n := 0

 	for {
 		err := l.next()
 		if err == errIncompleteLine {
 			// An incomplete line is fine as long as it doesn't
 			// impede parsing the fields we need. It shouldn't be
 			// possible for any mount to use more than 3*PATH_MAX
 			// before (9) because there are two paths and all other
 			// earlier fields have bounded options. Only (11) has
 			// unbounded options.
 		} else if err == errEOF {
 			break
 		} else if err != nil {
 			return 0, err
 		}

 		line := l.line()

 		// Skip first four fields.
 		for range 4 {
 			i := bytealg.IndexByte(line, ' ')
 			if i < 0 {
 				return 0, errMalformedFile
 			}
 			line = line[i+1:]
 		}

 		// (5) mount point:  mount point relative to the process's root
 		i := bytealg.IndexByte(line, ' ')
 		if i < 0 {
 			return 0, errMalformedFile
 		}
 		mnt := line[:i]
 		line = line[i+1:]

 		// Skip ahead past optional fields, delimited by " - ".
 		for {
 			i = bytealg.IndexByte(line, ' ')
 			if i < 0 {
 				return 0, errMalformedFile
 			}
 			// The separator must be followed by at least one more
 			// byte (the start of the filesystem type).
 			if i+3 >= len(line) {
 				return 0, errMalformedFile
 			}
 			delim := line[i : i+3]
 			if string(delim) == " - " {
 				line = line[i+3:]
 				break
 			}
 			line = line[i+1:]
 		}

 		// (9) filesystem type:  name of filesystem of the form "type[.subtype]"
 		i = bytealg.IndexByte(line, ' ')
 		if i < 0 {
 			return 0, errMalformedFile
 		}
 		ftype := line[:i]
 		line = line[i+1:]

 		if string(ftype) != "cgroup" && string(ftype) != "cgroup2" {
 			continue
 		}

 		// As in parseCPURelativePath, cgroup v1 with a CPU controller
 		// takes precedence over cgroup v2.
 		if string(ftype) == "cgroup2" {
 			// v2 hierarchy.
 			n, err = unescapePath(out, mnt)
 			if err != nil {
 				// Don't keep searching on error. The kernel
 				// should never produce broken escaping.
 				return n, err
 			}
 			// Keep searching, we might find a v1 hierarchy with a
 			// CPU controller, which takes precedence.
 			continue
 		}

 		// (10) mount source:  filesystem specific information or "none"
 		i = bytealg.IndexByte(line, ' ')
 		if i < 0 {
 			return 0, errMalformedFile
 		}
 		// Don't care about mount source.
 		line = line[i+1:]

 		// (11) super options:  per super block options
 		superOpt := line

 		// v1 hierarchy
 		if containsCPU(superOpt) {
 			// Found a v1 CPU controller. This must be the
 			// only one, so we're done.
 			return unescapePath(out, mnt)
 		}
 	}

 	if n == 0 {
 		// Found nothing.
 		return 0, ErrNoCgroup
 	}

 	return n, nil
 }

 var errInvalidEscape error = stringError("invalid path escape sequence")

 // unescapePath copies in to out, unescaping escape sequences generated by
 // Linux's show_path.
 //
 // That is, '\', ' ', '\t', and '\n' are converted to octal escape sequences,
 // like '\040' for space.
 //
 // out must be at least as large as in.
 //
 // Returns the number of bytes written to out.
 //
 // Also see escapePath in cgroup_linux_test.go.
 func unescapePath(out []byte, in []byte) (int, error) {
 	// Not strictly necessary, but simplifies the implementation and will
 	// always hold in users.
 	if len(out) < len(in) {
 		throw("output too small")
 	}

 	var outi, ini int
 	for ini < len(in) {
 		c := in[ini]
 		if c != '\\' {
 			out[outi] = c
 			outi++
 			ini++
 			continue
 		}

 		// Start of escape sequence.

 		// Escape sequence is always 4 characters: one slash and three
 		// digits.
 		if ini+3 >= len(in) {
 			return outi, errInvalidEscape
 		}

 		var outc byte
 		for i := range 3 {
 			c := in[ini+1+i]
 			if c < '0' || c > '9' {
 				return outi, errInvalidEscape
 			}

 			outc *= 8
 			outc += c - '0'
 		}

 		out[outi] = outc
 		outi++

 		ini += 4
 	}

 	return outi, nil
 }
	// Copyright 2025 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package cgroup

	import (
	"internal/bytealg"
	"internal/runtime/strconv"
	"internal/runtime/syscall/linux"
	)

	var (
	// ErrNoCgroup is returned by the Find*/Open* functions in this file
	// when no CPU cgroup could be found for the current process.
	ErrNoCgroup error = stringError("not in a cgroup")

	// errMalformedFile is returned when a file read by this package does
	// not have the format its parser expects.
	errMalformedFile error = stringError("malformed file")
	)

	// _PATH_MAX is the maximum path length this package sizes its buffers for
	// (presumably mirroring Linux's PATH_MAX; confirm against the kernel headers).
	const _PATH_MAX = 4096

	const (
	// Required amount of scratch space for OpenCPU.
	//
	// TODO(prattmic): This is shockingly large (~70KiB) due to the (very
	// unlikely) combination of extremely long paths consisting mostly of
	// escaped characters. The scratch buffer ends up in .bss in package
	// runtime, so it doesn't contribute to binary size and generally won't
	// be faulted in, but it would still be nice to shrink this. A more
	// complex parser that did not need to keep entire lines in memory
	// could get away with much less. Alternatively, we could do a one-off
	// mmap allocation for this buffer, which is only mapped larger if we
	// actually need the extra space.
	ScratchSize = PathSize + ParseSize

	// Required space to store a path of the cgroup in the filesystem.
	PathSize = _PATH_MAX

	// /proc/self/mountinfo path escape sequences are 4 characters long, so
	// a path consisting entirely of escaped characters could be 4 times
	// larger.
	escapedPathMax = 4 * _PATH_MAX

	// Required space to parse /proc/self/mountinfo and /proc/self/cgroup.
	// See parseCPUMount and parseCPURelativePath.
	ParseSize = 4 * escapedPathMax
	)

	// Names of the cgroup control files that OpenCPU appends to the cgroup
	// directory path before opening them.
	//
	// Include explicit NUL to be sure we include it in the slice: the path is
	// passed to linux.Open as a pointer to its first byte, so it must be
	// NUL-terminated.
	const (
	v2MaxFile = "/cpu.max\x00"
	v1QuotaFile = "/cpu.cfs_quota_us\x00"
	v1PeriodFile = "/cpu.cfs_period_us\x00"
	)

	// Version indicates the cgroup version.
	type Version int

	const (
	// VersionUnknown is the zero value; it is not a valid version for a
	// CPU (Close and ReadCPULimit throw on it).
	VersionUnknown Version = iota
	// V1 is cgroup v1.
	V1
	// V2 is cgroup v2.
	V2
	)

	// CPU owns the FDs required to read the CPU limit from a cgroup.
	//
	// A CPU is created by OpenCPU, read with ReadCPULimit, and released with
	// Close.
	type CPU struct {
	// version selects which of the FDs below are in use.
	version Version

	// For cgroup v1, this is cpu.cfs_quota_us.
	// For cgroup v2, this is cpu.max.
	quotaFD int

	// For cgroup v1, this is cpu.cfs_period_us.
	// For cgroup v2, this is unused (OpenCPU sets it to -1).
	periodFD int
	}

	// Close releases the file descriptors owned by c: the quota FD for both
	// cgroup versions, plus the period FD for cgroup v1.
	//
	// It throws if c does not carry a valid cgroup version.
	func (c CPU) Close() {
		if c.version != V1 && c.version != V2 {
			throw("impossible cgroup version")
		}

		// Both versions own quotaFD; only v1 additionally owns periodFD.
		linux.Close(c.quotaFD)
		if c.version == V1 {
			linux.Close(c.periodFD)
		}
	}

	// checkBufferSize throws unless s has exactly size bytes. The actual and
	// wanted lengths are printed before throwing.
	func checkBufferSize(s []byte, size int) {
		if len(s) == size {
			return
		}
		println("runtime: cgroup buffer length", len(s), "want", size)
		throw("runtime: cgroup invalid buffer length")
	}

	// OpenCPU returns a CPU for the CPU cgroup containing the current process, or
	// ErrNoCgroup if the process is not in a CPU cgroup.
	//
	// scratch must have length ScratchSize.
	func OpenCPU(scratch []byte) (CPU, error) {
	checkBufferSize(scratch, ScratchSize)

	base := scratch[:PathSize]
	scratch2 := scratch[PathSize:]

	n, version, err := FindCPU(base, scratch2)
	if err != nil {
	return CPU{}, err
	}

	switch version {
	case 1:
	n2 := copy(base[n:], v1QuotaFile)
	path := base[:n+n2]
	quotaFD, errno := linux.Open(&path[0], linux.O_RDONLY\|linux.O_CLOEXEC, 0)
	if errno != 0 {
	// This may fail if this process was migrated out of
	// the cgroup found by FindCPU and that cgroup has been
	// deleted.
	return CPU{}, errSyscallFailed
	}

	n2 = copy(base[n:], v1PeriodFile)
	path = base[:n+n2]
	periodFD, errno := linux.Open(&path[0], linux.O_RDONLY\|linux.O_CLOEXEC, 0)
	if errno != 0 {
	// This may fail if this process was migrated out of
	// the cgroup found by FindCPU and that cgroup has been
	// deleted.
	return CPU{}, errSyscallFailed
	}

	c := CPU{
	version: 1,
	quotaFD: quotaFD,
	periodFD: periodFD,
	}
	return c, nil
	case 2:
	n2 := copy(base[n:], v2MaxFile)
	path := base[:n+n2]
	maxFD, errno := linux.Open(&path[0], linux.O_RDONLY\|linux.O_CLOEXEC, 0)
	if errno != 0 {
	// This may fail if this process was migrated out of
	// the cgroup found by FindCPU and that cgroup has been
	// deleted.
	return CPU{}, errSyscallFailed
	}

	c := CPU{
	version: 2,
	quotaFD: maxFD,
	periodFD: -1,
	}
	return c, nil
	default:
	throw("impossible cgroup version")
	panic("unreachable")
	}
	}

	// ReadCPULimit returns the average CPU throughput limit from the cgroup, or
	// ok false if there is no limit.
	//
	// For cgroup v1 the limit is quota/period, read from the two files owned by
	// c; a negative quota means no limit. For cgroup v2 the limit is parsed from
	// cpu.max.
	//
	// It throws if c does not carry a valid cgroup version.
	func ReadCPULimit(c CPU) (float64, bool, error) {
		switch c.version {
		case V1:
			quota, err := readV1Number(c.quotaFD)
			if err != nil {
				// Pass the error through so that a failed read is not
				// misreported as a malformed file.
				return 0, false, err
			}

			if quota < 0 {
				// No limit.
				return 0, false, nil
			}

			period, err := readV1Number(c.periodFD)
			if err != nil {
				return 0, false, err
			}
			if period <= 0 {
				// A non-positive period is meaningless and would turn
				// the division below into +Inf or NaN.
				return 0, false, errMalformedFile
			}

			return float64(quota) / float64(period), true, nil
		case V2:
			// quotaFD is the cpu.max FD.
			return readV2Limit(c.quotaFD)
		default:
			throw("impossible cgroup version")
			panic("unreachable")
		}
	}

	// readV1Number reads fd from offset 0 and returns the number stored in a
	// cgroup v1 quota/period file.
	//
	// It returns errSyscallFailed if the read fails and errMalformedFile if the
	// contents fill the whole buffer or do not parse.
	func readV1Number(fd int) (int64, error) {
		// The file holds "<value>\n", where value is an int64 number of
		// microseconds (the quota may be -1, meaning no limit).
		//
		// MaxInt64 is 19 decimal digits, so 19 + 1 (newline) = 20 bytes is a
		// conservative upper bound on the file size. The buffer is a bit
		// larger for good measure.
		var data [64]byte

		// Read at offset 0 every time so that we always see a fresh value.
		n, errno := linux.Pread(fd, data[:], 0)
		switch {
		case errno != 0:
			return 0, errSyscallFailed
		case n == len(data):
			// A completely full buffer means the file is larger than
			// anything we expect.
			return 0, errMalformedFile
		}

		return parseV1Number(data[:n])
	}

	// parseV1Number parses the "<value>\n" contents of a cgroup v1 quota/period
	// file. Everything from the first newline on is ignored; a missing newline
	// or an unparsable value yields errMalformedFile.
	func parseV1Number(buf []byte) (int64, error) {
		nl := bytealg.IndexByte(buf, '\n')
		if nl < 0 {
			return 0, errMalformedFile
		}

		if val, ok := strconv.Atoi64(string(buf[:nl])); ok {
			return val, nil
		}
		return 0, errMalformedFile
	}

	// readV2Limit reads a cgroup v2 cpu.max file from offset 0 and returns the
	// CPU throughput limit, or ok false if there is no limit.
	//
	// It returns errSyscallFailed if the read fails and errMalformedFile if the
	// contents fill the whole buffer or do not parse.
	func readV2Limit(fd int) (float64, bool, error) {
		// The file holds "<quota> <period>\n". Both values are in
		// microseconds, and quota may be the literal "max" for no limit.
		//
		// Note that the kernel is inconsistent about whether the values are
		// uint64 or int64: values are parsed as uint64 but printed as int64.
		// See kernel/sched/core.c:cpu_max_{show,write}.
		//
		// In practice, the kernel limits the period to 1s (1000000us) (see
		// max_cfs_quota_period), and the quota to (1<<44)us (see
		// max_cfs_runtime), so these values can't get large enough for the
		// distinction to matter.
		//
		// MaxInt64 is 19 decimal digits, so 19 + 19 + 1 (space) + 1 (newline)
		// = 40 bytes is a conservative upper bound on the file size. The
		// buffer is a bit larger for good measure.
		var data [64]byte

		// Read at offset 0 every time so that we always see a fresh value.
		n, errno := linux.Pread(fd, data[:], 0)
		switch {
		case errno != 0:
			return 0, false, errSyscallFailed
		case n == len(data):
			// A completely full buffer means the file is larger than
			// anything we expect.
			return 0, false, errMalformedFile
		}

		return parseV2Limit(data[:n])
	}

	// parseV2Limit parses the "<quota> <period>\n" contents of a cgroup v2
	// cpu.max file and returns quota/period.
	//
	// It returns ok false with a nil error if quota is "max" (no limit). It
	// returns errMalformedFile if the separating space or the terminating
	// newline is missing, if either number does not parse, or if the period is
	// not positive.
	func parseV2Limit(buf []byte) (float64, bool, error) {
		i := bytealg.IndexByte(buf, ' ')
		if i < 0 {
			return 0, false, errMalformedFile
		}

		quotaStr := buf[:i]
		// Comparing via string conversion does not allocate.
		if string(quotaStr) == "max" {
			// No limit.
			return 0, false, nil
		}

		periodStr := buf[i+1:]
		// The period must be terminated by a newline; drop it and anything
		// after it.
		i = bytealg.IndexByte(periodStr, '\n')
		if i < 0 {
			return 0, false, errMalformedFile
		}
		periodStr = periodStr[:i]

		quota, ok := strconv.Atoi64(string(quotaStr))
		if !ok {
			return 0, false, errMalformedFile
		}

		period, ok := strconv.Atoi64(string(periodStr))
		if !ok {
			return 0, false, errMalformedFile
		}
		if period <= 0 {
			// A non-positive period is meaningless and would turn the
			// division below into +Inf or NaN.
			return 0, false, errMalformedFile
		}

		return float64(quota) / float64(period), true, nil
	}

	// FindCPU writes to out the path of the CPU cgroup that this process is a
	// member of. scratch is a scratch buffer for internal use.
	//
	// out must have length PathSize. scratch must have length ParseSize.
	//
	// Returns the number of bytes written to out and the cgroup version (1 or 2).
	//
	// Returns ErrNoCgroup if the process is not in a CPU cgroup.
	func FindCPU(out []byte, scratch []byte) (int, Version, error) {
		checkBufferSize(out, PathSize)
		checkBufferSize(scratch, ParseSize)

		// The result is <cgroup mount point> + <relative path>, looked up in
		// that order.
		//
		// The two lookups are not atomic, so this is racy if our cgroup is
		// changed while this runs. For example, initially there is only a
		// cgroup v2 mount and we are not in a cgroup. After, there is a cgroup
		// v1 mount with a CPU controller and we are placed in a cgroup in this
		// hierarchy. In that case the mount point lookup could pick the v2
		// mount while the relative path lookup finds the v1 path.
		//
		// In this case we'll later fail to read the cgroup files and fall back
		// to assuming no cgroup.
		mountLen, err := FindCPUMountPoint(out, scratch)
		if err != nil {
			return 0, 0, err
		}

		// The relative path always starts with /, so it can be written
		// directly after the mount point.
		relLen, version, err := FindCPURelativePath(out[mountLen:], scratch)
		if err != nil {
			return 0, 0, err
		}

		return mountLen + relLen, version, nil
	}

	// FindCPURelativePath finds the path to the CPU cgroup that this process is a member of
	// relative to the root of the cgroup mount and places it in out. scratch is a
	// scratch buffer for internal use.
	//
	// out must have length PathSize minus the size of the cgroup mount root (if
	// known). scratch must have length ParseSize.
	//
	// Returns the number of bytes written to out and the cgroup version (1 or 2).
	//
	// Returns ErrNoCgroup if the process is not in a CPU cgroup.
	func FindCPURelativePath(out []byte, scratch []byte) (int, Version, error) {
	path := []byte("/proc/self/cgroup\x00")
	fd, errno := linux.Open(&path[0], linux.O_RDONLY\|linux.O_CLOEXEC, 0)
	if errno == linux.ENOENT {
	return 0, 0, ErrNoCgroup
	} else if errno != 0 {
	return 0, 0, errSyscallFailed
	}

	// The relative path always starts with /, so we can directly append it
	// to the mount point.
	n, version, err := parseCPURelativePath(fd, linux.Read, out[:], scratch)
	if err != nil {
	linux.Close(fd)
	return 0, 0, err
	}

	linux.Close(fd)
	return n, version, nil
	}

	// parseCPURelativePath reads /proc/self/cgroup-formatted data from fd using
	// read and writes to out the path of the current process's CPU cgroup
	// relative to the cgroup mount. scratch backs the line reader.
	//
	// Returns the number of bytes written and the cgroup version (1 or 2), or
	// ErrNoCgroup if no suitable line was found.
	func parseCPURelativePath(fd int, read func(fd int, b []byte) (int, uintptr), out []byte, scratch []byte) (int, Version, error) {
		// Every line looks like
		//
		//   hierarchy-ID:controller-list:cgroup-path
		//
		// where controller-list is comma-separated. See man 5 cgroup for more
		// details.
		//
		// cgroup v2 has hierarchy-ID 0. If a v1 hierarchy contains "cpu", that
		// is the CPU controller. Otherwise the v2 hierarchy (if any) is the
		// CPU controller.
		//
		// hierarchy-ID and controller-list have relatively small maximum
		// sizes, and the path can be up to _PATH_MAX, so we need a bit more
		// than 1 _PATH_MAX of scratch space.

		lr := newLineReader(fd, scratch, read)

		// Length of the v2 path copied to out so far, if any.
		v2Len := 0

		for {
			err := lr.next()
			if err == errEOF {
				break
			}
			if err != nil {
				// This includes errIncompleteLine: incomplete lines are
				// not allowed. While in theory the incomplete line may
				// be for a controller we don't care about, in practice
				// all lines should be of similar length, so we should
				// just have a buffer big enough for any.
				return 0, 0, err
			}

			rest := lr.line()

			// Split off hierarchy-ID.
			sep := bytealg.IndexByte(rest, ':')
			if sep < 0 {
				return 0, 0, errMalformedFile
			}
			hierarchy := rest[:sep]
			rest = rest[sep+1:]

			// Split off controller-list; what remains is cgroup-path.
			sep = bytealg.IndexByte(rest, ':')
			if sep < 0 {
				return 0, 0, errMalformedFile
			}
			controllers := rest[:sep]
			cgroupPath := rest[sep+1:]

			if string(hierarchy) != "0" {
				// v1 hierarchy.
				if containsCPU(controllers) {
					// Found a v1 CPU controller. This must be the
					// only one, so we're done.
					return copy(out, cgroupPath), V1, nil
				}
				continue
			}

			// v2 hierarchy. Remember it, but keep searching: a v1
			// hierarchy with a CPU controller takes precedence.
			v2Len = copy(out, cgroupPath)
		}

		if v2Len == 0 {
			// Found nothing.
			return 0, 0, ErrNoCgroup
		}

		// Must be v2, v1 returns from inside the loop.
		return v2Len, V2, nil
	}

	// containsCPU reports whether the comma-separated list b has an element that
	// is exactly "cpu".
	func containsCPU(b []byte) bool {
		for len(b) > 0 {
			// Peel the next element off the front of the list.
			item := b
			if comma := bytealg.IndexByte(b, ','); comma >= 0 {
				item, b = b[:comma], b[comma+1:]
			} else {
				// Last element.
				b = nil
			}

			// Neither cmd/compile nor gccgo allocates for this string conversion.
			if string(item) == "cpu" {
				return true
			}
		}

		return false
	}

	// FindCPUMountPoint finds the root of the CPU cgroup mount places it in out.
	// scratch is a scratch buffer for internal use.
	//
	// out must have length PathSize. scratch must have length ParseSize.
	//
	// Returns the number of bytes written to out.
	//
	// Returns ErrNoCgroup if the process is not in a CPU cgroup.
	func FindCPUMountPoint(out []byte, scratch []byte) (int, error) {
	checkBufferSize(out, PathSize)
	checkBufferSize(scratch, ParseSize)

	path := []byte("/proc/self/mountinfo\x00")
	fd, errno := linux.Open(&path[0], linux.O_RDONLY\|linux.O_CLOEXEC, 0)
	if errno == linux.ENOENT {
	return 0, ErrNoCgroup
	} else if errno != 0 {
	return 0, errSyscallFailed
	}

	n, err := parseCPUMount(fd, linux.Read, out, scratch)
	if err != nil {
	linux.Close(fd)
	return 0, err
	}
	linux.Close(fd)

	return n, nil
	}

	// parseCPUMount reads /proc/self/mountinfo-formatted data from fd (using
	// read) and writes the unescaped mount point of the cpu cgroup controller
	// (v1 or v2) to out. scratch is the line buffer used while parsing.
	//
	// Returns the number of bytes written to out, or ErrNoCgroup if no
	// suitable mount was found.
	func parseCPUMount(fd int, read func(fd int, b []byte) (int, uintptr), out []byte, scratch []byte) (int, error) {
		// The format of each line is:
		//
		// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
		// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
		//
		// (1) mount ID: unique identifier of the mount (may be reused after umount)
		// (2) parent ID: ID of parent (or of self for the top of the mount tree)
		// (3) major:minor: value of st_dev for files on filesystem
		// (4) root: root of the mount within the filesystem
		// (5) mount point: mount point relative to the process's root
		// (6) mount options: per mount options
		// (7) optional fields: zero or more fields of the form "tag[:value]"
		// (8) separator: marks the end of the optional fields
		// (9) filesystem type: name of filesystem of the form "type[.subtype]"
		// (10) mount source: filesystem specific information or "none"
		// (11) super options: per super block options
		//
		// See man 5 proc_pid_mountinfo for more details.
		//
		// Emitted paths never contain a literal space, tab, newline, or
		// carriage return; those are escaped (see Linux show_mountinfo ->
		// show_path), so the mount point must be unescaped before it is
		// returned.
		//
		// The result is the mount point (5) of a mount whose filesystem type
		// (9) is cgroup2, or cgroup with "cpu" in the super options (11).
		//
		// (4), (5), and (10) are up to _PATH_MAX. The remaining fields have a
		// small fixed maximum size, so 4*_PATH_MAX is plenty of scratch space.
		// Non-cgroup mounts may have arbitrarily long (11), but those can be
		// skipped when parsing.

		reader := newLineReader(fd, scratch, read)

		// Bytes written to out.
		written := 0

		for {
			err := reader.next()
			if err == errEOF {
				break
			}
			// An incomplete line is tolerated as long as it doesn't
			// impede parsing the fields we need. No mount should use
			// more than 3*PATH_MAX before (9): there are two paths and
			// every other earlier field is bounded. Only (11) is
			// unbounded.
			if err != nil && err != errIncompleteLine {
				return 0, err
			}

			rest := reader.line()

			// Walk fields (1) through (5). After the loop mnt holds the
			// last one consumed, which is (5), the mount point relative
			// to the process's root.
			var mnt []byte
			for field := 1; field <= 5; field++ {
				sp := bytealg.IndexByte(rest, ' ')
				if sp < 0 {
					return 0, errMalformedFile
				}
				mnt = rest[:sp]
				rest = rest[sp+1:]
			}

			// Skip (6) and the optional fields (7) by advancing one
			// space at a time until the " - " separator (8) is found.
			for {
				sp := bytealg.IndexByte(rest, ' ')
				if sp < 0 || sp+3 >= len(rest) {
					return 0, errMalformedFile
				}
				if string(rest[sp:sp+3]) == " - " {
					rest = rest[sp+3:]
					break
				}
				rest = rest[sp+1:]
			}

			// (9) filesystem type: name of filesystem of the form "type[.subtype]"
			sp := bytealg.IndexByte(rest, ' ')
			if sp < 0 {
				return 0, errMalformedFile
			}
			ftype := rest[:sp]
			rest = rest[sp+1:]

			// As in findCPUPath, cgroup v1 with a CPU controller takes
			// precedence over cgroup v2.
			if string(ftype) == "cgroup2" {
				// v2 hierarchy.
				written, err = unescapePath(out, mnt)
				if err != nil {
					// Don't keep searching on error. The kernel
					// should never produce broken escaping.
					return written, err
				}
				// Keep searching, we might find a v1 hierarchy with
				// a CPU controller, which takes precedence.
				continue
			}
			if string(ftype) != "cgroup" {
				// Not a cgroup mount at all.
				continue
			}

			// v1 hierarchy.

			// (10) mount source: filesystem specific information or
			// "none". Its value is irrelevant; step over it.
			sp = bytealg.IndexByte(rest, ' ')
			if sp < 0 {
				return 0, errMalformedFile
			}

			// (11) super options: per super block options. This is
			// everything that remains on the line.
			if containsCPU(rest[sp+1:]) {
				// Found a v1 CPU controller. This must be the only
				// one, so we're done.
				return unescapePath(out, mnt)
			}
		}

		if written == 0 {
			// Found nothing.
			return 0, ErrNoCgroup
		}

		return written, nil
	}

	// errInvalidEscape is returned by unescapePath when a backslash in the
	// input does not begin a well-formed escape sequence.
	var errInvalidEscape error = stringError("invalid path escape sequence")

	// unescapePath copies in to out, unescaping escape sequences generated by
	// Linux's show_path.
	//
	// That is, '\', ' ', '\t', and '\n' are converted to octal escape sequences,
	// like '\040' for space.
	//
	// out must be at least as large as in.
	//
	// Returns the number of bytes written to out.
	//
	// Also see escapePath in cgroup_linux_test.go.
	func unescapePath(out []byte, in []byte) (int, error) {
	// Not strictly necessary, but simplifies the implementation and will
	// always hold in users.
	if len(out) < len(in) {
	throw("output too small")
	}

	var outi, ini int
	for ini < len(in) {
	c := in[ini]
	if c != '\\' {
	out[outi] = c
	outi++
	ini++
	continue
	}

	// Start of escape sequence.

	// Escape sequence is always 4 characters: one slash and three
	// digits.
	if ini+3 >= len(in) {
	return outi, errInvalidEscape
	}

	var outc byte
	for i := range 3 {
	c := in[ini+1+i]
	if c < '0' \|\| c > '9' {
	return outi, errInvalidEscape
	}

	outc *= 8
	outc += c - '0'
	}

	out[outi] = outc
	outi++

	ini += 4
	}

	return outi, nil
	}