| // Copyright 2010 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package runtime |
| |
| import ( |
| "runtime/internal/atomic" |
| "runtime/internal/sys" |
| "unsafe" |
| ) |
| |
// Linux errno values that this file compares against system call results.
const (
	// _EACCES is the errno sysAlloc reports as "access denied".
	_EACCES = 13
	// _EINVAL is the errno for an invalid argument.
	_EINVAL = 22
)
| |
| // Don't split the stack as this method may be invoked without a valid G, which |
| // prevents us from allocating more stack. |
| //go:nosplit |
| func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { |
| p, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) |
| if err != 0 { |
| if err == _EACCES { |
| print("runtime: mmap: access denied\n") |
| exit(2) |
| } |
| if err == _EAGAIN { |
| print("runtime: mmap: too much locked memory (check 'ulimit -l').\n") |
| exit(2) |
| } |
| return nil |
| } |
| mSysStatInc(sysStat, n) |
| return p |
| } |
| |
| var adviseUnused = uint32(_MADV_FREE) |
| |
| func sysUnused(v unsafe.Pointer, n uintptr) { |
| // By default, Linux's "transparent huge page" support will |
| // merge pages into a huge page if there's even a single |
| // present regular page, undoing the effects of madvise(adviseUnused) |
| // below. On amd64, that means khugepaged can turn a single |
| // 4KB page to 2MB, bloating the process's RSS by as much as |
| // 512X. (See issue #8832 and Linux kernel bug |
| // https://bugzilla.kernel.org/show_bug.cgi?id=93111) |
| // |
| // To work around this, we explicitly disable transparent huge |
| // pages when we release pages of the heap. However, we have |
| // to do this carefully because changing this flag tends to |
| // split the VMA (memory mapping) containing v in to three |
| // VMAs in order to track the different values of the |
| // MADV_NOHUGEPAGE flag in the different regions. There's a |
| // default limit of 65530 VMAs per address space (sysctl |
| // vm.max_map_count), so we must be careful not to create too |
| // many VMAs (see issue #12233). |
| // |
| // Since huge pages are huge, there's little use in adjusting |
| // the MADV_NOHUGEPAGE flag on a fine granularity, so we avoid |
| // exploding the number of VMAs by only adjusting the |
| // MADV_NOHUGEPAGE flag on a large granularity. This still |
| // gets most of the benefit of huge pages while keeping the |
| // number of VMAs under control. With hugePageSize = 2MB, even |
| // a pessimal heap can reach 128GB before running out of VMAs. |
| if sys.HugePageSize != 0 { |
| var s uintptr = sys.HugePageSize // division by constant 0 is a compile-time error :( |
| |
| // If it's a large allocation, we want to leave huge |
| // pages enabled. Hence, we only adjust the huge page |
| // flag on the huge pages containing v and v+n-1, and |
| // only if those aren't aligned. |
| var head, tail uintptr |
| if uintptr(v)%s != 0 { |
| // Compute huge page containing v. |
| head = uintptr(v) &^ (s - 1) |
| } |
| if (uintptr(v)+n)%s != 0 { |
| // Compute huge page containing v+n-1. |
| tail = (uintptr(v) + n - 1) &^ (s - 1) |
| } |
| |
| // Note that madvise will return EINVAL if the flag is |
| // already set, which is quite likely. We ignore |
| // errors. |
| if head != 0 && head+sys.HugePageSize == tail { |
| // head and tail are different but adjacent, |
| // so do this in one call. |
| madvise(unsafe.Pointer(head), 2*sys.HugePageSize, _MADV_NOHUGEPAGE) |
| } else { |
| // Advise the huge pages containing v and v+n-1. |
| if head != 0 { |
| madvise(unsafe.Pointer(head), sys.HugePageSize, _MADV_NOHUGEPAGE) |
| } |
| if tail != 0 && tail != head { |
| madvise(unsafe.Pointer(tail), sys.HugePageSize, _MADV_NOHUGEPAGE) |
| } |
| } |
| } |
| |
| if uintptr(v)&(physPageSize-1) != 0 || n&(physPageSize-1) != 0 { |
| // madvise will round this to any physical page |
| // *covered* by this range, so an unaligned madvise |
| // will release more memory than intended. |
| throw("unaligned sysUnused") |
| } |
| |
| var advise uint32 |
| if debug.madvdontneed != 0 { |
| advise = _MADV_DONTNEED |
| } else { |
| advise = atomic.Load(&adviseUnused) |
| } |
| if errno := madvise(v, n, int32(advise)); advise == _MADV_FREE && errno != 0 { |
| // MADV_FREE was added in Linux 4.5. Fall back to MADV_DONTNEED if it is |
| // not supported. |
| atomic.Store(&adviseUnused, _MADV_DONTNEED) |
| madvise(v, n, _MADV_DONTNEED) |
| } |
| } |
| |
| func sysUsed(v unsafe.Pointer, n uintptr) { |
| if sys.HugePageSize != 0 { |
| // Partially undo the NOHUGEPAGE marks from sysUnused |
| // for whole huge pages between v and v+n. This may |
| // leave huge pages off at the end points v and v+n |
| // even though allocations may cover these entire huge |
| // pages. We could detect this and undo NOHUGEPAGE on |
| // the end points as well, but it's probably not worth |
| // the cost because when neighboring allocations are |
| // freed sysUnused will just set NOHUGEPAGE again. |
| var s uintptr = sys.HugePageSize |
| |
| // Round v up to a huge page boundary. |
| beg := (uintptr(v) + (s - 1)) &^ (s - 1) |
| // Round v+n down to a huge page boundary. |
| end := (uintptr(v) + n) &^ (s - 1) |
| |
| if beg < end { |
| madvise(unsafe.Pointer(beg), end-beg, _MADV_HUGEPAGE) |
| } |
| } |
| } |
| |
| // Don't split the stack as this function may be invoked without a valid G, |
| // which prevents us from allocating more stack. |
| //go:nosplit |
| func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) { |
| mSysStatDec(sysStat, n) |
| munmap(v, n) |
| } |
| |
| func sysFault(v unsafe.Pointer, n uintptr) { |
| mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE|_MAP_FIXED, -1, 0) |
| } |
| |
| func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer { |
| p, err := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0) |
| if err != 0 { |
| return nil |
| } |
| return p |
| } |
| |
| func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) { |
| mSysStatInc(sysStat, n) |
| |
| p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) |
| if err == _ENOMEM { |
| throw("runtime: out of memory") |
| } |
| if p != v || err != 0 { |
| throw("runtime: cannot map pages in arena address space") |
| } |
| } |