synchronization

commit 33f9b3ce46
2025-08-25 16:04:00 +08:00
1951 changed files with 854396 additions and 0 deletions


@@ -0,0 +1,29 @@
BSD 3-Clause License
Copyright (c) 2018, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


@@ -0,0 +1,634 @@
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
package nvml
// #cgo LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files
// #include "nvml_dl.h"
import "C"
import (
"errors"
"fmt"
"io/ioutil"
"os"
"sort"
"strconv"
"strings"
)
const (
szDriver = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE
szName = C.NVML_DEVICE_NAME_BUFFER_SIZE
szUUID = C.NVML_DEVICE_UUID_BUFFER_SIZE
szProcs = 32
szProcName = 64
XidCriticalError = C.nvmlEventTypeXidCriticalError
)
type handle struct{ dev C.nvmlDevice_t }
type EventSet struct{ set C.nvmlEventSet_t }
type Event struct {
UUID *string
Etype uint64
Edata uint64
}
func uintPtr(c C.uint) *uint {
i := uint(c)
return &i
}
func uint64Ptr(c C.ulonglong) *uint64 {
i := uint64(c)
return &i
}
func stringPtr(c *C.char) *string {
s := C.GoString(c)
return &s
}
func errorString(ret C.nvmlReturn_t) error {
if ret == C.NVML_SUCCESS {
return nil
}
err := C.GoString(C.nvmlErrorString(ret))
return fmt.Errorf("nvml: %v", err)
}
func init_() error {
r := C.nvmlInit_dl()
if r == C.NVML_ERROR_LIBRARY_NOT_FOUND {
return errors.New("could not load NVML library")
}
return errorString(r)
}
func NewEventSet() EventSet {
var set C.nvmlEventSet_t
C.nvmlEventSetCreate(&set)
return EventSet{set}
}
func RegisterEvent(es EventSet, event int) error {
n, err := deviceGetCount()
if err != nil {
return err
}
var i uint
for i = 0; i < n; i++ {
h, err := deviceGetHandleByIndex(i)
if err != nil {
return err
}
r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set)
if r != C.NVML_SUCCESS {
return errorString(r)
}
}
return nil
}
func RegisterEventForDevice(es EventSet, event int, uuid string) error {
n, err := deviceGetCount()
if err != nil {
return err
}
var i uint
for i = 0; i < n; i++ {
h, err := deviceGetHandleByIndex(i)
if err != nil {
return err
}
duuid, err := h.deviceGetUUID()
if err != nil {
return err
}
if *duuid != uuid {
continue
}
r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set)
if r != C.NVML_SUCCESS {
return errorString(r)
}
return nil
}
return fmt.Errorf("nvml: device not found")
}
func DeleteEventSet(es EventSet) {
C.nvmlEventSetFree(es.set)
}
func WaitForEvent(es EventSet, timeout uint) (Event, error) {
var data C.nvmlEventData_t
r := C.nvmlEventSetWait(es.set, &data, C.uint(timeout))
uuid, _ := handle{data.device}.deviceGetUUID()
return Event{
UUID: uuid,
Etype: uint64(data.eventType),
Edata: uint64(data.eventData),
},
errorString(r)
}
func shutdown() error {
return errorString(C.nvmlShutdown_dl())
}
func systemGetDriverVersion() (string, error) {
var driver [szDriver]C.char
r := C.nvmlSystemGetDriverVersion(&driver[0], szDriver)
return C.GoString(&driver[0]), errorString(r)
}
func systemGetProcessName(pid uint) (string, error) {
var proc [szProcName]C.char
r := C.nvmlSystemGetProcessName(C.uint(pid), &proc[0], szProcName)
return C.GoString(&proc[0]), errorString(r)
}
func deviceGetCount() (uint, error) {
var n C.uint
r := C.nvmlDeviceGetCount(&n)
return uint(n), errorString(r)
}
func deviceGetHandleByIndex(idx uint) (handle, error) {
var dev C.nvmlDevice_t
r := C.nvmlDeviceGetHandleByIndex(C.uint(idx), &dev)
return handle{dev}, errorString(r)
}
func deviceGetTopologyCommonAncestor(h1, h2 handle) (*uint, error) {
var level C.nvmlGpuTopologyLevel_t
r := C.nvmlDeviceGetTopologyCommonAncestor_dl(h1.dev, h2.dev, &level)
if r == C.NVML_ERROR_FUNCTION_NOT_FOUND || r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(C.uint(level)), errorString(r)
}
func (h handle) deviceGetName() (*string, error) {
var name [szName]C.char
r := C.nvmlDeviceGetName(h.dev, &name[0], szName)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return stringPtr(&name[0]), errorString(r)
}
func (h handle) deviceGetUUID() (*string, error) {
var uuid [szUUID]C.char
r := C.nvmlDeviceGetUUID(h.dev, &uuid[0], szUUID)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return stringPtr(&uuid[0]), errorString(r)
}
func (h handle) deviceGetPciInfo() (*string, error) {
var pci C.nvmlPciInfo_t
r := C.nvmlDeviceGetPciInfo(h.dev, &pci)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return stringPtr(&pci.busId[0]), errorString(r)
}
func (h handle) deviceGetMinorNumber() (*uint, error) {
var minor C.uint
r := C.nvmlDeviceGetMinorNumber(h.dev, &minor)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(minor), errorString(r)
}
func (h handle) deviceGetBAR1MemoryInfo() (*uint64, *uint64, error) {
var bar1 C.nvmlBAR1Memory_t
r := C.nvmlDeviceGetBAR1MemoryInfo(h.dev, &bar1)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
return uint64Ptr(bar1.bar1Total), uint64Ptr(bar1.bar1Used), errorString(r)
}
func (h handle) deviceGetPowerManagementLimit() (*uint, error) {
var power C.uint
r := C.nvmlDeviceGetPowerManagementLimit(h.dev, &power)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(power), errorString(r)
}
func (h handle) deviceGetMaxClockInfo() (*uint, *uint, error) {
var sm, mem C.uint
r := C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_SM, &sm)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
if r == C.NVML_SUCCESS {
r = C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem)
}
return uintPtr(sm), uintPtr(mem), errorString(r)
}
func (h handle) deviceGetMaxPcieLinkGeneration() (*uint, error) {
var link C.uint
r := C.nvmlDeviceGetMaxPcieLinkGeneration(h.dev, &link)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(link), errorString(r)
}
func (h handle) deviceGetMaxPcieLinkWidth() (*uint, error) {
var width C.uint
r := C.nvmlDeviceGetMaxPcieLinkWidth(h.dev, &width)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(width), errorString(r)
}
func (h handle) deviceGetPowerUsage() (*uint, error) {
var power C.uint
r := C.nvmlDeviceGetPowerUsage(h.dev, &power)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(power), errorString(r)
}
func (h handle) deviceGetTemperature() (*uint, error) {
var temp C.uint
r := C.nvmlDeviceGetTemperature(h.dev, C.NVML_TEMPERATURE_GPU, &temp)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(temp), errorString(r)
}
func (h handle) deviceGetUtilizationRates() (*uint, *uint, error) {
var usage C.nvmlUtilization_t
r := C.nvmlDeviceGetUtilizationRates(h.dev, &usage)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
return uintPtr(usage.gpu), uintPtr(usage.memory), errorString(r)
}
func (h handle) deviceGetEncoderUtilization() (*uint, error) {
var usage, sampling C.uint
r := C.nvmlDeviceGetEncoderUtilization(h.dev, &usage, &sampling)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(usage), errorString(r)
}
func (h handle) deviceGetDecoderUtilization() (*uint, error) {
var usage, sampling C.uint
r := C.nvmlDeviceGetDecoderUtilization(h.dev, &usage, &sampling)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(usage), errorString(r)
}
func (h handle) deviceGetMemoryInfo() (totalMem *uint64, devMem DeviceMemory, err error) {
var mem C.nvmlMemory_t
r := C.nvmlDeviceGetMemoryInfo(h.dev, &mem)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
err = errorString(r)
if r != C.NVML_SUCCESS {
return
}
totalMem = uint64Ptr(mem.total)
if totalMem != nil {
*totalMem /= 1024 * 1024 // MiB
}
devMem = DeviceMemory{
Used: uint64Ptr(mem.used),
Free: uint64Ptr(mem.free),
}
if devMem.Used != nil {
*devMem.Used /= 1024 * 1024 // MiB
}
if devMem.Free != nil {
*devMem.Free /= 1024 * 1024 // MiB
}
return
}
func (h handle) deviceGetClockInfo() (*uint, *uint, error) {
var sm, mem C.uint
r := C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_SM, &sm)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
if r == C.NVML_SUCCESS {
r = C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem)
}
return uintPtr(sm), uintPtr(mem), errorString(r)
}
func (h handle) deviceGetMemoryErrorCounter() (*uint64, *uint64, *uint64, error) {
var l1, l2, mem C.ulonglong
r := C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L1_CACHE, &l1)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil, nil
}
if r == C.NVML_SUCCESS {
r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L2_CACHE, &l2)
}
if r == C.NVML_SUCCESS {
r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_DEVICE_MEMORY, &mem)
}
return uint64Ptr(l1), uint64Ptr(l2), uint64Ptr(mem), errorString(r)
}
func (h handle) deviceGetPcieThroughput() (*uint, *uint, error) {
var rx, tx C.uint
r := C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_RX_BYTES, &rx)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
if r == C.NVML_SUCCESS {
r = C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_TX_BYTES, &tx)
}
return uintPtr(rx), uintPtr(tx), errorString(r)
}
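// The running-process queries below use a fixed buffer of szProcs (32)
// entries; if more processes than that are active on the device, NVML
// reports an insufficient-size error instead of truncating the list.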
func (h handle) deviceGetComputeRunningProcesses() ([]uint, []uint64, error) {
var procs [szProcs]C.nvmlProcessInfo_t
var count = C.uint(szProcs)
r := C.nvmlDeviceGetComputeRunningProcesses(h.dev, &count, &procs[0])
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
n := int(count)
pids := make([]uint, n)
mems := make([]uint64, n)
for i := 0; i < n; i++ {
pids[i] = uint(procs[i].pid)
mems[i] = uint64(procs[i].usedGpuMemory)
}
return pids, mems, errorString(r)
}
func (h handle) deviceGetGraphicsRunningProcesses() ([]uint, []uint64, error) {
var procs [szProcs]C.nvmlProcessInfo_t
var count = C.uint(szProcs)
r := C.nvmlDeviceGetGraphicsRunningProcesses(h.dev, &count, &procs[0])
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
n := int(count)
pids := make([]uint, n)
mems := make([]uint64, n)
for i := 0; i < n; i++ {
pids[i] = uint(procs[i].pid)
mems[i] = uint64(procs[i].usedGpuMemory)
}
return pids, mems, errorString(r)
}
func (h handle) deviceGetAllRunningProcesses() ([]ProcessInfo, error) {
cPids, cpMems, err := h.deviceGetComputeRunningProcesses()
if err != nil {
return nil, err
}
gPids, gpMems, err := h.deviceGetGraphicsRunningProcesses()
if err != nil {
return nil, err
}
allPids := make(map[uint]ProcessInfo)
for i, pid := range cPids {
name, err := processName(pid)
if err != nil {
return nil, err
}
allPids[pid] = ProcessInfo{
PID: pid,
Name: name,
MemoryUsed: cpMems[i] / (1024 * 1024), // MiB
Type: Compute,
}
}
for i, pid := range gPids {
pInfo, exists := allPids[pid]
if exists {
pInfo.Type = ComputeAndGraphics
allPids[pid] = pInfo
} else {
name, err := processName(pid)
if err != nil {
return nil, err
}
allPids[pid] = ProcessInfo{
PID: pid,
Name: name,
MemoryUsed: gpMems[i] / (1024 * 1024), // MiB
Type: Graphics,
}
}
}
var processInfo []ProcessInfo
for _, v := range allPids {
processInfo = append(processInfo, v)
}
sort.Slice(processInfo, func(i, j int) bool {
return processInfo[i].PID < processInfo[j].PID
})
return processInfo, nil
}
func (h handle) getClocksThrottleReasons() (reason ThrottleReason, err error) {
var clocksThrottleReasons C.ulonglong
r := C.nvmlDeviceGetCurrentClocksThrottleReasons(h.dev, &clocksThrottleReasons)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return ThrottleReasonUnknown, nil
}
if r != C.NVML_SUCCESS {
return ThrottleReasonUnknown, errorString(r)
}
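// nvmlDeviceGetCurrentClocksThrottleReasons returns a bitmask; the switch
// below only matches single-reason values, so a combination of reasons falls
// through and the zero value (ThrottleReasonGpuIdle) is returned.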
switch clocksThrottleReasons {
case C.nvmlClocksThrottleReasonGpuIdle:
reason = ThrottleReasonGpuIdle
case C.nvmlClocksThrottleReasonApplicationsClocksSetting:
reason = ThrottleReasonApplicationsClocksSetting
case C.nvmlClocksThrottleReasonSwPowerCap:
reason = ThrottleReasonSwPowerCap
case C.nvmlClocksThrottleReasonHwSlowdown:
reason = ThrottleReasonHwSlowdown
case C.nvmlClocksThrottleReasonSyncBoost:
reason = ThrottleReasonSyncBoost
case C.nvmlClocksThrottleReasonSwThermalSlowdown:
reason = ThrottleReasonSwThermalSlowdown
case C.nvmlClocksThrottleReasonHwThermalSlowdown:
reason = ThrottleReasonHwThermalSlowdown
case C.nvmlClocksThrottleReasonHwPowerBrakeSlowdown:
reason = ThrottleReasonHwPowerBrakeSlowdown
case C.nvmlClocksThrottleReasonDisplayClockSetting:
reason = ThrottleReasonDisplayClockSetting
case C.nvmlClocksThrottleReasonNone:
reason = ThrottleReasonNone
}
return
}
func (h handle) getPerformanceState() (PerfState, error) {
var pstate C.nvmlPstates_t
r := C.nvmlDeviceGetPerformanceState(h.dev, &pstate)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return PerfStateUnknown, nil
}
if r != C.NVML_SUCCESS {
return PerfStateUnknown, errorString(r)
}
return PerfState(pstate), nil
}
func processName(pid uint) (string, error) {
f := `/proc/` + strconv.FormatUint(uint64(pid), 10) + `/comm`
d, err := ioutil.ReadFile(f)
if err != nil {
// TOCTOU: process terminated
if os.IsNotExist(err) {
return "", nil
}
return "", err
}
return strings.TrimSuffix(string(d), "\n"), err
}
func (h handle) getAccountingInfo() (accountingInfo Accounting, err error) {
var mode C.nvmlEnableState_t
var buffer C.uint
r := C.nvmlDeviceGetAccountingMode(h.dev, &mode)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
if r != C.NVML_SUCCESS {
return accountingInfo, errorString(r)
}
r = C.nvmlDeviceGetAccountingBufferSize(h.dev, &buffer)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
if r != C.NVML_SUCCESS {
return accountingInfo, errorString(r)
}
accountingInfo = Accounting{
Mode: ModeState(mode),
BufferSize: uintPtr(buffer),
}
return
}
func (h handle) getDisplayInfo() (display Display, err error) {
var mode, isActive C.nvmlEnableState_t
r := C.nvmlDeviceGetDisplayActive(h.dev, &mode)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
if r != C.NVML_SUCCESS {
return display, errorString(r)
}
r = C.nvmlDeviceGetDisplayMode(h.dev, &isActive)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
if r != C.NVML_SUCCESS {
return display, errorString(r)
}
display = Display{
Mode: ModeState(mode),
Active: ModeState(isActive),
}
return
}
func (h handle) getPersistenceMode() (state ModeState, err error) {
var mode C.nvmlEnableState_t
r := C.nvmlDeviceGetPersistenceMode(h.dev, &mode)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
return ModeState(mode), errorString(r)
}
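A minimal sketch (not part of this commit) of how the exported event API above might be driven from another package, using Init and Shutdown from the companion file later in this diff. The import path is a placeholder, and the sketch assumes the NVML library and a GPU that supports Xid events are present.

package main

import (
	"fmt"
	"log"

	nvml "example.com/placeholder/nvml" // assumed import path
)

func main() {
	if err := nvml.Init(); err != nil {
		log.Fatal(err)
	}
	defer nvml.Shutdown()

	es := nvml.NewEventSet()
	defer nvml.DeleteEventSet(es)

	// Subscribe every device to Xid critical errors.
	if err := nvml.RegisterEvent(es, nvml.XidCriticalError); err != nil {
		log.Fatal(err)
	}

	// Block for up to 5000 ms waiting for one event; a timeout also
	// surfaces as an error here.
	e, err := nvml.WaitForEvent(es, 5000)
	if err != nil {
		log.Println(err)
		return
	}
	if e.UUID != nil {
		fmt.Printf("Xid %d on GPU %s\n", e.Edata, *e.UUID)
	}
}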


@@ -0,0 +1,533 @@
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
package nvml
// #include "nvml_dl.h"
import "C"
import (
"bytes"
"errors"
"fmt"
"io/ioutil"
"strconv"
"strings"
)
var (
ErrCPUAffinity = errors.New("failed to retrieve CPU affinity")
ErrUnsupportedP2PLink = errors.New("unsupported P2P link type")
ErrUnsupportedGPU = errors.New("unsupported GPU device")
)
type ModeState uint
const (
Enabled ModeState = iota
Disabled
)
func (m ModeState) String() string {
switch m {
case Enabled:
return "Enabled"
case Disabled:
return "Disabled"
}
return "N/A"
}
type Display struct {
Mode ModeState
Active ModeState
}
type Accounting struct {
Mode ModeState
BufferSize *uint
}
type DeviceMode struct {
DisplayInfo Display
Persistence ModeState
AccountingInfo Accounting
}
type ThrottleReason uint
const (
ThrottleReasonGpuIdle ThrottleReason = iota
ThrottleReasonApplicationsClocksSetting
ThrottleReasonSwPowerCap
ThrottleReasonHwSlowdown
ThrottleReasonSyncBoost
ThrottleReasonSwThermalSlowdown
ThrottleReasonHwThermalSlowdown
ThrottleReasonHwPowerBrakeSlowdown
ThrottleReasonDisplayClockSetting
ThrottleReasonNone
ThrottleReasonUnknown
)
func (r ThrottleReason) String() string {
switch r {
case ThrottleReasonGpuIdle:
return "Gpu Idle"
case ThrottleReasonApplicationsClocksSetting:
return "Applications Clocks Setting"
case ThrottleReasonSwPowerCap:
return "SW Power Cap"
case ThrottleReasonHwSlowdown:
return "HW Slowdown"
case ThrottleReasonSyncBoost:
return "Sync Boost"
case ThrottleReasonSwThermalSlowdown:
return "SW Thermal Slowdown"
case ThrottleReasonHwThermalSlowdown:
return "HW Thermal Slowdown"
case ThrottleReasonHwPowerBrakeSlowdown:
return "HW Power Brake Slowdown"
case ThrottleReasonDisplayClockSetting:
return "Display Clock Setting"
case ThrottleReasonNone:
return "No clocks throttling"
}
return "N/A"
}
type PerfState uint
const (
PerfStateMax = 0
PerfStateMin = 15
PerfStateUnknown = 32
)
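// P0 is the highest performance state and P15 the lowest; 32 corresponds to
// NVML's "unknown" performance state.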
func (p PerfState) String() string {
if p >= PerfStateMax && p <= PerfStateMin {
return fmt.Sprintf("P%d", p)
}
return "Unknown"
}
type ProcessType uint
const (
Compute ProcessType = iota
Graphics
ComputeAndGraphics
)
func (t ProcessType) String() string {
typ := "C+G"
if t == Compute {
typ = "C"
} else if t == Graphics {
typ = "G"
}
return typ
}
type P2PLinkType uint
const (
P2PLinkUnknown P2PLinkType = iota
P2PLinkCrossCPU
P2PLinkSameCPU
P2PLinkHostBridge
P2PLinkMultiSwitch
P2PLinkSingleSwitch
P2PLinkSameBoard
)
type P2PLink struct {
BusID string
Link P2PLinkType
}
func (t P2PLinkType) String() string {
switch t {
case P2PLinkCrossCPU:
return "Cross CPU socket"
case P2PLinkSameCPU:
return "Same CPU socket"
case P2PLinkHostBridge:
return "Host PCI bridge"
case P2PLinkMultiSwitch:
return "Multiple PCI switches"
case P2PLinkSingleSwitch:
return "Single PCI switch"
case P2PLinkSameBoard:
return "Same board"
case P2PLinkUnknown:
}
return "N/A"
}
type ClockInfo struct {
Cores *uint
Memory *uint
}
type PCIInfo struct {
BusID string
BAR1 *uint64
Bandwidth *uint
}
type Device struct {
handle
UUID string
Path string
Model *string
Power *uint
Memory *uint64
CPUAffinity *uint
PCI PCIInfo
Clocks ClockInfo
Topology []P2PLink
}
type UtilizationInfo struct {
GPU *uint
Memory *uint
Encoder *uint
Decoder *uint
}
type PCIThroughputInfo struct {
RX *uint
TX *uint
}
type PCIStatusInfo struct {
BAR1Used *uint64
Throughput PCIThroughputInfo
}
type ECCErrorsInfo struct {
L1Cache *uint64
L2Cache *uint64
Device *uint64
}
type DeviceMemory struct {
Used *uint64
Free *uint64
}
type MemoryInfo struct {
Global DeviceMemory
ECCErrors ECCErrorsInfo
}
type ProcessInfo struct {
PID uint
Name string
MemoryUsed uint64
Type ProcessType
}
type DeviceStatus struct {
Power *uint
Temperature *uint
Utilization UtilizationInfo
Memory MemoryInfo
Clocks ClockInfo
PCI PCIStatusInfo
Processes []ProcessInfo
Throttle ThrottleReason
Performance PerfState
}
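// assert panics on a non-nil error; NewDevice, NewDeviceLite, Status and
// GetDeviceMode recover the panic in a deferred function and return it as err.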
func assert(err error) {
if err != nil {
panic(err)
}
}
func Init() error {
return init_()
}
func Shutdown() error {
return shutdown()
}
func GetDeviceCount() (uint, error) {
return deviceGetCount()
}
func GetDriverVersion() (string, error) {
return systemGetDriverVersion()
}
func numaNode(busid string) (uint, error) {
// discard leading zeros of busid
b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", strings.ToLower(busid[4:])))
if err != nil {
// XXX report node 0 if NUMA support isn't enabled
return 0, nil
}
node, err := strconv.ParseInt(string(bytes.TrimSpace(b)), 10, 8)
if err != nil {
return 0, fmt.Errorf("%v: %v", ErrCPUAffinity, err)
}
if node < 0 {
node = 0 // XXX report node 0 instead of NUMA_NO_NODE
}
return uint(node), nil
}
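// pciBandwidth estimates the peak PCIe bandwidth in MB/s from the per-lane
// rate of the link generation and the link width, e.g. a gen3 x16 link gives
// 985 * 16 = 15760 MB/s.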
func pciBandwidth(gen, width *uint) *uint {
m := map[uint]uint{
1: 250, // MB/s
2: 500,
3: 985,
4: 1969,
}
if gen == nil || width == nil {
return nil
}
bw := m[*gen] * *width
return &bw
}
func NewDevice(idx uint) (device *Device, err error) {
defer func() {
if r := recover(); r != nil {
err = r.(error)
}
}()
h, err := deviceGetHandleByIndex(idx)
assert(err)
model, err := h.deviceGetName()
assert(err)
uuid, err := h.deviceGetUUID()
assert(err)
minor, err := h.deviceGetMinorNumber()
assert(err)
power, err := h.deviceGetPowerManagementLimit()
assert(err)
totalMem, _, err := h.deviceGetMemoryInfo()
assert(err)
busid, err := h.deviceGetPciInfo()
assert(err)
bar1, _, err := h.deviceGetBAR1MemoryInfo()
assert(err)
pcig, err := h.deviceGetMaxPcieLinkGeneration()
assert(err)
pciw, err := h.deviceGetMaxPcieLinkWidth()
assert(err)
ccore, cmem, err := h.deviceGetMaxClockInfo()
assert(err)
if minor == nil || busid == nil || uuid == nil {
return nil, ErrUnsupportedGPU
}
path := fmt.Sprintf("/dev/nvidia%d", *minor)
node, err := numaNode(*busid)
assert(err)
device = &Device{
handle: h,
UUID: *uuid,
Path: path,
Model: model,
Power: power,
Memory: totalMem,
CPUAffinity: &node,
PCI: PCIInfo{
BusID: *busid,
BAR1: bar1,
Bandwidth: pciBandwidth(pcig, pciw), // MB/s
},
Clocks: ClockInfo{
Cores: ccore, // MHz
Memory: cmem, // MHz
},
}
if power != nil {
*device.Power /= 1000 // W
}
if bar1 != nil {
*device.PCI.BAR1 /= 1024 * 1024 // MiB
}
return
}
func NewDeviceLite(idx uint) (device *Device, err error) {
defer func() {
if r := recover(); r != nil {
err = r.(error)
}
}()
h, err := deviceGetHandleByIndex(idx)
assert(err)
uuid, err := h.deviceGetUUID()
assert(err)
minor, err := h.deviceGetMinorNumber()
assert(err)
busid, err := h.deviceGetPciInfo()
assert(err)
if minor == nil || busid == nil || uuid == nil {
return nil, ErrUnsupportedGPU
}
path := fmt.Sprintf("/dev/nvidia%d", *minor)
device = &Device{
handle: h,
UUID: *uuid,
Path: path,
PCI: PCIInfo{
BusID: *busid,
},
}
return
}
func (d *Device) Status() (status *DeviceStatus, err error) {
defer func() {
if r := recover(); r != nil {
err = r.(error)
}
}()
power, err := d.deviceGetPowerUsage()
assert(err)
temp, err := d.deviceGetTemperature()
assert(err)
ugpu, umem, err := d.deviceGetUtilizationRates()
assert(err)
uenc, err := d.deviceGetEncoderUtilization()
assert(err)
udec, err := d.deviceGetDecoderUtilization()
assert(err)
_, devMem, err := d.deviceGetMemoryInfo()
assert(err)
ccore, cmem, err := d.deviceGetClockInfo()
assert(err)
_, bar1, err := d.deviceGetBAR1MemoryInfo()
assert(err)
el1, el2, emem, err := d.deviceGetMemoryErrorCounter()
assert(err)
pcirx, pcitx, err := d.deviceGetPcieThroughput()
assert(err)
throttle, err := d.getClocksThrottleReasons()
assert(err)
perfState, err := d.getPerformanceState()
assert(err)
processInfo, err := d.deviceGetAllRunningProcesses()
assert(err)
status = &DeviceStatus{
Power: power,
Temperature: temp, // °C
Utilization: UtilizationInfo{
GPU: ugpu, // %
Memory: umem, // %
Encoder: uenc, // %
Decoder: udec, // %
},
Memory: MemoryInfo{
Global: devMem,
ECCErrors: ECCErrorsInfo{
L1Cache: el1,
L2Cache: el2,
Device: emem,
},
},
Clocks: ClockInfo{
Cores: ccore, // MHz
Memory: cmem, // MHz
},
PCI: PCIStatusInfo{
BAR1Used: bar1,
Throughput: PCIThroughputInfo{
RX: pcirx,
TX: pcitx,
},
},
Throttle: throttle,
Performance: perfState,
Processes: processInfo,
}
if power != nil {
*status.Power /= 1000 // W
}
if bar1 != nil {
*status.PCI.BAR1Used /= 1024 * 1024 // MiB
}
if pcirx != nil {
*status.PCI.Throughput.RX /= 1000 // MB/s
}
if pcitx != nil {
*status.PCI.Throughput.TX /= 1000 // MB/s
}
return
}
func GetP2PLink(dev1, dev2 *Device) (link P2PLinkType, err error) {
level, err := deviceGetTopologyCommonAncestor(dev1.handle, dev2.handle)
if err != nil || level == nil {
return P2PLinkUnknown, err
}
switch *level {
case C.NVML_TOPOLOGY_INTERNAL:
link = P2PLinkSameBoard
case C.NVML_TOPOLOGY_SINGLE:
link = P2PLinkSingleSwitch
case C.NVML_TOPOLOGY_MULTIPLE:
link = P2PLinkMultiSwitch
case C.NVML_TOPOLOGY_HOSTBRIDGE:
link = P2PLinkHostBridge
case C.NVML_TOPOLOGY_CPU:
link = P2PLinkSameCPU
case C.NVML_TOPOLOGY_SYSTEM:
link = P2PLinkCrossCPU
default:
err = ErrUnsupportedP2PLink
}
return
}
func (d *Device) GetComputeRunningProcesses() ([]uint, []uint64, error) {
return d.handle.deviceGetComputeRunningProcesses()
}
func (d *Device) GetGraphicsRunningProcesses() ([]uint, []uint64, error) {
return d.handle.deviceGetGraphicsRunningProcesses()
}
func (d *Device) GetAllRunningProcesses() ([]ProcessInfo, error) {
return d.handle.deviceGetAllRunningProcesses()
}
func (d *Device) GetDeviceMode() (mode *DeviceMode, err error) {
defer func() {
if r := recover(); r != nil {
err = r.(error)
}
}()
display, err := d.getDisplayInfo()
assert(err)
p, err := d.getPersistenceMode()
assert(err)
accounting, err := d.getAccountingInfo()
assert(err)
mode = &DeviceMode{
DisplayInfo: display,
Persistence: p,
AccountingInfo: accounting,
}
return
}
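
A sketch of the higher-level flow this file exposes (Init, GetDeviceCount, NewDevice, Status, Shutdown); again the import path is a placeholder, and the optional fields are pointers that stay nil when a query is unsupported on the GPU.

package main

import (
	"fmt"
	"log"

	nvml "example.com/placeholder/nvml" // assumed import path
)

func main() {
	if err := nvml.Init(); err != nil {
		log.Fatal(err)
	}
	defer nvml.Shutdown()

	n, err := nvml.GetDeviceCount()
	if err != nil {
		log.Fatal(err)
	}
	for i := uint(0); i < n; i++ {
		d, err := nvml.NewDevice(i)
		if err != nil {
			log.Fatal(err)
		}
		st, err := d.Status()
		if err != nil {
			log.Fatal(err)
		}
		model := "N/A"
		if d.Model != nil {
			model = *d.Model
		}
		fmt.Printf("GPU %d: %s (%s)\n", i, d.UUID, model)
		if st.Temperature != nil && st.Power != nil {
			// Power is already converted to watts by Status.
			fmt.Printf("  %d C, %d W, pstate %v\n", *st.Temperature, *st.Power, st.Performance)
		}
	}
}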

File diff suppressed because it is too large


@@ -0,0 +1,46 @@
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#include <stddef.h>
#include <dlfcn.h>
#include "nvml_dl.h"
#define DLSYM(x, sym) \
do { \
dlerror(); \
x = dlsym(handle, #sym); \
if (dlerror() != NULL) { \
return (NVML_ERROR_FUNCTION_NOT_FOUND); \
} \
} while (0)
typedef nvmlReturn_t (*nvmlSym_t)();
static void *handle;
nvmlReturn_t NVML_DL(nvmlInit)(void)
{
handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY | RTLD_GLOBAL);
if (handle == NULL) {
return (NVML_ERROR_LIBRARY_NOT_FOUND);
}
return (nvmlInit());
}
nvmlReturn_t NVML_DL(nvmlShutdown)(void)
{
nvmlReturn_t r = nvmlShutdown();
if (r != NVML_SUCCESS) {
return (r);
}
return (dlclose(handle) ? NVML_ERROR_UNKNOWN : NVML_SUCCESS);
}
nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)(
nvmlDevice_t dev1, nvmlDevice_t dev2, nvmlGpuTopologyLevel_t *info)
{
nvmlSym_t sym;
DLSYM(sym, nvmlDeviceGetTopologyCommonAncestor);
return ((*sym)(dev1, dev2, info));
}


@@ -0,0 +1,15 @@
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#ifndef _NVML_DL_H_
#define _NVML_DL_H_
#include "nvml.h"
#define NVML_DL(x) x##_dl
extern nvmlReturn_t NVML_DL(nvmlInit)(void);
extern nvmlReturn_t NVML_DL(nvmlShutdown)(void);
extern nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)(
nvmlDevice_t, nvmlDevice_t, nvmlGpuTopologyLevel_t *);
#endif // _NVML_DL_H_