synchronization
This commit is contained in:
29
gpushare-device-plugin/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE
generated
vendored
Normal file
29
gpushare-device-plugin/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE
generated
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
BSD 3-Clause License
|
||||
|
||||
Copyright (c) 2018, NVIDIA Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
634
gpushare-device-plugin/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go
generated
vendored
Normal file
634
gpushare-device-plugin/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go
generated
vendored
Normal file
@@ -0,0 +1,634 @@
|
||||
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
package nvml
|
||||
|
||||
// #cgo LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files
|
||||
// #include "nvml_dl.h"
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const (
|
||||
szDriver = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE
|
||||
szName = C.NVML_DEVICE_NAME_BUFFER_SIZE
|
||||
szUUID = C.NVML_DEVICE_UUID_BUFFER_SIZE
|
||||
szProcs = 32
|
||||
szProcName = 64
|
||||
|
||||
XidCriticalError = C.nvmlEventTypeXidCriticalError
|
||||
)
|
||||
|
||||
type handle struct{ dev C.nvmlDevice_t }
|
||||
type EventSet struct{ set C.nvmlEventSet_t }
|
||||
type Event struct {
|
||||
UUID *string
|
||||
Etype uint64
|
||||
Edata uint64
|
||||
}
|
||||
|
||||
func uintPtr(c C.uint) *uint {
|
||||
i := uint(c)
|
||||
return &i
|
||||
}
|
||||
|
||||
func uint64Ptr(c C.ulonglong) *uint64 {
|
||||
i := uint64(c)
|
||||
return &i
|
||||
}
|
||||
|
||||
func stringPtr(c *C.char) *string {
|
||||
s := C.GoString(c)
|
||||
return &s
|
||||
}
|
||||
|
||||
func errorString(ret C.nvmlReturn_t) error {
|
||||
if ret == C.NVML_SUCCESS {
|
||||
return nil
|
||||
}
|
||||
err := C.GoString(C.nvmlErrorString(ret))
|
||||
return fmt.Errorf("nvml: %v", err)
|
||||
}
|
||||
|
||||
func init_() error {
|
||||
r := C.nvmlInit_dl()
|
||||
if r == C.NVML_ERROR_LIBRARY_NOT_FOUND {
|
||||
return errors.New("could not load NVML library")
|
||||
}
|
||||
return errorString(r)
|
||||
}
|
||||
|
||||
func NewEventSet() EventSet {
|
||||
var set C.nvmlEventSet_t
|
||||
C.nvmlEventSetCreate(&set)
|
||||
|
||||
return EventSet{set}
|
||||
}
|
||||
|
||||
func RegisterEvent(es EventSet, event int) error {
|
||||
n, err := deviceGetCount()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var i uint
|
||||
for i = 0; i < n; i++ {
|
||||
h, err := deviceGetHandleByIndex(i)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set)
|
||||
if r != C.NVML_SUCCESS {
|
||||
return errorString(r)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func RegisterEventForDevice(es EventSet, event int, uuid string) error {
|
||||
n, err := deviceGetCount()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var i uint
|
||||
for i = 0; i < n; i++ {
|
||||
h, err := deviceGetHandleByIndex(i)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
duuid, err := h.deviceGetUUID()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if *duuid != uuid {
|
||||
continue
|
||||
}
|
||||
|
||||
r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set)
|
||||
if r != C.NVML_SUCCESS {
|
||||
return errorString(r)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("nvml: device not found")
|
||||
}
|
||||
|
||||
func DeleteEventSet(es EventSet) {
|
||||
C.nvmlEventSetFree(es.set)
|
||||
}
|
||||
|
||||
func WaitForEvent(es EventSet, timeout uint) (Event, error) {
|
||||
var data C.nvmlEventData_t
|
||||
|
||||
r := C.nvmlEventSetWait(es.set, &data, C.uint(timeout))
|
||||
uuid, _ := handle{data.device}.deviceGetUUID()
|
||||
|
||||
return Event{
|
||||
UUID: uuid,
|
||||
Etype: uint64(data.eventType),
|
||||
Edata: uint64(data.eventData),
|
||||
},
|
||||
errorString(r)
|
||||
}
|
||||
|
||||
func shutdown() error {
|
||||
return errorString(C.nvmlShutdown_dl())
|
||||
}
|
||||
|
||||
func systemGetDriverVersion() (string, error) {
|
||||
var driver [szDriver]C.char
|
||||
|
||||
r := C.nvmlSystemGetDriverVersion(&driver[0], szDriver)
|
||||
return C.GoString(&driver[0]), errorString(r)
|
||||
}
|
||||
|
||||
func systemGetProcessName(pid uint) (string, error) {
|
||||
var proc [szProcName]C.char
|
||||
|
||||
r := C.nvmlSystemGetProcessName(C.uint(pid), &proc[0], szProcName)
|
||||
return C.GoString(&proc[0]), errorString(r)
|
||||
}
|
||||
|
||||
func deviceGetCount() (uint, error) {
|
||||
var n C.uint
|
||||
|
||||
r := C.nvmlDeviceGetCount(&n)
|
||||
return uint(n), errorString(r)
|
||||
}
|
||||
|
||||
func deviceGetHandleByIndex(idx uint) (handle, error) {
|
||||
var dev C.nvmlDevice_t
|
||||
|
||||
r := C.nvmlDeviceGetHandleByIndex(C.uint(idx), &dev)
|
||||
return handle{dev}, errorString(r)
|
||||
}
|
||||
|
||||
func deviceGetTopologyCommonAncestor(h1, h2 handle) (*uint, error) {
|
||||
var level C.nvmlGpuTopologyLevel_t
|
||||
|
||||
r := C.nvmlDeviceGetTopologyCommonAncestor_dl(h1.dev, h2.dev, &level)
|
||||
if r == C.NVML_ERROR_FUNCTION_NOT_FOUND || r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(C.uint(level)), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetName() (*string, error) {
|
||||
var name [szName]C.char
|
||||
|
||||
r := C.nvmlDeviceGetName(h.dev, &name[0], szName)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return stringPtr(&name[0]), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetUUID() (*string, error) {
|
||||
var uuid [szUUID]C.char
|
||||
|
||||
r := C.nvmlDeviceGetUUID(h.dev, &uuid[0], szUUID)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return stringPtr(&uuid[0]), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetPciInfo() (*string, error) {
|
||||
var pci C.nvmlPciInfo_t
|
||||
|
||||
r := C.nvmlDeviceGetPciInfo(h.dev, &pci)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return stringPtr(&pci.busId[0]), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMinorNumber() (*uint, error) {
|
||||
var minor C.uint
|
||||
|
||||
r := C.nvmlDeviceGetMinorNumber(h.dev, &minor)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(minor), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetBAR1MemoryInfo() (*uint64, *uint64, error) {
|
||||
var bar1 C.nvmlBAR1Memory_t
|
||||
|
||||
r := C.nvmlDeviceGetBAR1MemoryInfo(h.dev, &bar1)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
return uint64Ptr(bar1.bar1Total), uint64Ptr(bar1.bar1Used), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetPowerManagementLimit() (*uint, error) {
|
||||
var power C.uint
|
||||
|
||||
r := C.nvmlDeviceGetPowerManagementLimit(h.dev, &power)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(power), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMaxClockInfo() (*uint, *uint, error) {
|
||||
var sm, mem C.uint
|
||||
|
||||
r := C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_SM, &sm)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
if r == C.NVML_SUCCESS {
|
||||
r = C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem)
|
||||
}
|
||||
return uintPtr(sm), uintPtr(mem), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMaxPcieLinkGeneration() (*uint, error) {
|
||||
var link C.uint
|
||||
|
||||
r := C.nvmlDeviceGetMaxPcieLinkGeneration(h.dev, &link)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(link), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMaxPcieLinkWidth() (*uint, error) {
|
||||
var width C.uint
|
||||
|
||||
r := C.nvmlDeviceGetMaxPcieLinkWidth(h.dev, &width)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(width), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetPowerUsage() (*uint, error) {
|
||||
var power C.uint
|
||||
|
||||
r := C.nvmlDeviceGetPowerUsage(h.dev, &power)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(power), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetTemperature() (*uint, error) {
|
||||
var temp C.uint
|
||||
|
||||
r := C.nvmlDeviceGetTemperature(h.dev, C.NVML_TEMPERATURE_GPU, &temp)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(temp), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetUtilizationRates() (*uint, *uint, error) {
|
||||
var usage C.nvmlUtilization_t
|
||||
|
||||
r := C.nvmlDeviceGetUtilizationRates(h.dev, &usage)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
return uintPtr(usage.gpu), uintPtr(usage.memory), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetEncoderUtilization() (*uint, error) {
|
||||
var usage, sampling C.uint
|
||||
|
||||
r := C.nvmlDeviceGetEncoderUtilization(h.dev, &usage, &sampling)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(usage), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetDecoderUtilization() (*uint, error) {
|
||||
var usage, sampling C.uint
|
||||
|
||||
r := C.nvmlDeviceGetDecoderUtilization(h.dev, &usage, &sampling)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(usage), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMemoryInfo() (totalMem *uint64, devMem DeviceMemory, err error) {
|
||||
var mem C.nvmlMemory_t
|
||||
|
||||
r := C.nvmlDeviceGetMemoryInfo(h.dev, &mem)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return
|
||||
}
|
||||
|
||||
err = errorString(r)
|
||||
if r != C.NVML_SUCCESS {
|
||||
return
|
||||
}
|
||||
|
||||
totalMem = uint64Ptr(mem.total)
|
||||
if totalMem != nil {
|
||||
*totalMem /= 1024 * 1024 // MiB
|
||||
}
|
||||
|
||||
devMem = DeviceMemory{
|
||||
Used: uint64Ptr(mem.used),
|
||||
Free: uint64Ptr(mem.free),
|
||||
}
|
||||
|
||||
if devMem.Used != nil {
|
||||
*devMem.Used /= 1024 * 1024 // MiB
|
||||
}
|
||||
|
||||
if devMem.Free != nil {
|
||||
*devMem.Free /= 1024 * 1024 // MiB
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (h handle) deviceGetClockInfo() (*uint, *uint, error) {
|
||||
var sm, mem C.uint
|
||||
|
||||
r := C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_SM, &sm)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
if r == C.NVML_SUCCESS {
|
||||
r = C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem)
|
||||
}
|
||||
return uintPtr(sm), uintPtr(mem), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMemoryErrorCounter() (*uint64, *uint64, *uint64, error) {
|
||||
var l1, l2, mem C.ulonglong
|
||||
|
||||
r := C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
|
||||
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L1_CACHE, &l1)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil, nil
|
||||
}
|
||||
if r == C.NVML_SUCCESS {
|
||||
r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
|
||||
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L2_CACHE, &l2)
|
||||
}
|
||||
if r == C.NVML_SUCCESS {
|
||||
r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
|
||||
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_DEVICE_MEMORY, &mem)
|
||||
}
|
||||
return uint64Ptr(l1), uint64Ptr(l2), uint64Ptr(mem), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetPcieThroughput() (*uint, *uint, error) {
|
||||
var rx, tx C.uint
|
||||
|
||||
r := C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_RX_BYTES, &rx)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
if r == C.NVML_SUCCESS {
|
||||
r = C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_TX_BYTES, &tx)
|
||||
}
|
||||
return uintPtr(rx), uintPtr(tx), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetComputeRunningProcesses() ([]uint, []uint64, error) {
|
||||
var procs [szProcs]C.nvmlProcessInfo_t
|
||||
var count = C.uint(szProcs)
|
||||
|
||||
r := C.nvmlDeviceGetComputeRunningProcesses(h.dev, &count, &procs[0])
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
n := int(count)
|
||||
pids := make([]uint, n)
|
||||
mems := make([]uint64, n)
|
||||
for i := 0; i < n; i++ {
|
||||
pids[i] = uint(procs[i].pid)
|
||||
mems[i] = uint64(procs[i].usedGpuMemory)
|
||||
}
|
||||
return pids, mems, errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetGraphicsRunningProcesses() ([]uint, []uint64, error) {
|
||||
var procs [szProcs]C.nvmlProcessInfo_t
|
||||
var count = C.uint(szProcs)
|
||||
|
||||
r := C.nvmlDeviceGetGraphicsRunningProcesses(h.dev, &count, &procs[0])
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
n := int(count)
|
||||
pids := make([]uint, n)
|
||||
mems := make([]uint64, n)
|
||||
for i := 0; i < n; i++ {
|
||||
pids[i] = uint(procs[i].pid)
|
||||
mems[i] = uint64(procs[i].usedGpuMemory)
|
||||
}
|
||||
return pids, mems, errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetAllRunningProcesses() ([]ProcessInfo, error) {
|
||||
cPids, cpMems, err := h.deviceGetComputeRunningProcesses()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
gPids, gpMems, err := h.deviceGetGraphicsRunningProcesses()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
allPids := make(map[uint]ProcessInfo)
|
||||
|
||||
for i, pid := range cPids {
|
||||
name, err := processName(pid)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
allPids[pid] = ProcessInfo{
|
||||
PID: pid,
|
||||
Name: name,
|
||||
MemoryUsed: cpMems[i] / (1024 * 1024), // MiB
|
||||
Type: Compute,
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
for i, pid := range gPids {
|
||||
pInfo, exists := allPids[pid]
|
||||
if exists {
|
||||
pInfo.Type = ComputeAndGraphics
|
||||
allPids[pid] = pInfo
|
||||
} else {
|
||||
name, err := processName(pid)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
allPids[pid] = ProcessInfo{
|
||||
PID: pid,
|
||||
Name: name,
|
||||
MemoryUsed: gpMems[i] / (1024 * 1024), // MiB
|
||||
Type: Graphics,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var processInfo []ProcessInfo
|
||||
for _, v := range allPids {
|
||||
processInfo = append(processInfo, v)
|
||||
}
|
||||
sort.Slice(processInfo, func(i, j int) bool {
|
||||
return processInfo[i].PID < processInfo[j].PID
|
||||
})
|
||||
|
||||
return processInfo, nil
|
||||
}
|
||||
|
||||
func (h handle) getClocksThrottleReasons() (reason ThrottleReason, err error) {
|
||||
var clocksThrottleReasons C.ulonglong
|
||||
|
||||
r := C.nvmlDeviceGetCurrentClocksThrottleReasons(h.dev, &clocksThrottleReasons)
|
||||
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return ThrottleReasonUnknown, nil
|
||||
}
|
||||
|
||||
if r != C.NVML_SUCCESS {
|
||||
return ThrottleReasonUnknown, errorString(r)
|
||||
}
|
||||
|
||||
switch clocksThrottleReasons {
|
||||
case C.nvmlClocksThrottleReasonGpuIdle:
|
||||
reason = ThrottleReasonGpuIdle
|
||||
case C.nvmlClocksThrottleReasonApplicationsClocksSetting:
|
||||
reason = ThrottleReasonApplicationsClocksSetting
|
||||
case C.nvmlClocksThrottleReasonSwPowerCap:
|
||||
reason = ThrottleReasonSwPowerCap
|
||||
case C.nvmlClocksThrottleReasonHwSlowdown:
|
||||
reason = ThrottleReasonHwSlowdown
|
||||
case C.nvmlClocksThrottleReasonSyncBoost:
|
||||
reason = ThrottleReasonSyncBoost
|
||||
case C.nvmlClocksThrottleReasonSwThermalSlowdown:
|
||||
reason = ThrottleReasonSwThermalSlowdown
|
||||
case C.nvmlClocksThrottleReasonHwThermalSlowdown:
|
||||
reason = ThrottleReasonHwThermalSlowdown
|
||||
case C.nvmlClocksThrottleReasonHwPowerBrakeSlowdown:
|
||||
reason = ThrottleReasonHwPowerBrakeSlowdown
|
||||
case C.nvmlClocksThrottleReasonDisplayClockSetting:
|
||||
reason = ThrottleReasonDisplayClockSetting
|
||||
case C.nvmlClocksThrottleReasonNone:
|
||||
reason = ThrottleReasonNone
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (h handle) getPerformanceState() (PerfState, error) {
|
||||
var pstate C.nvmlPstates_t
|
||||
|
||||
r := C.nvmlDeviceGetPerformanceState(h.dev, &pstate)
|
||||
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return PerfStateUnknown, nil
|
||||
}
|
||||
|
||||
if r != C.NVML_SUCCESS {
|
||||
return PerfStateUnknown, errorString(r)
|
||||
}
|
||||
return PerfState(pstate), nil
|
||||
}
|
||||
|
||||
// processName reads /proc/<pid>/comm and returns its contents with one
// trailing newline removed. A missing file is not an error: the process may
// have exited between being listed and being looked up, so "" is returned
// with a nil error. Any other read failure is returned as is.
func processName(pid uint) (string, error) {
	commPath := "/proc/" + strconv.FormatUint(uint64(pid), 10) + "/comm"

	raw, err := ioutil.ReadFile(commPath)
	switch {
	case err == nil:
		return strings.TrimSuffix(string(raw), "\n"), nil
	case os.IsNotExist(err):
		// TOCTOU: process terminated
		return "", nil
	default:
		return "", err
	}
}
|
||||
|
||||
func (h handle) getAccountingInfo() (accountingInfo Accounting, err error) {
|
||||
var mode C.nvmlEnableState_t
|
||||
var buffer C.uint
|
||||
|
||||
r := C.nvmlDeviceGetAccountingMode(h.dev, &mode)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return
|
||||
}
|
||||
|
||||
if r != C.NVML_SUCCESS {
|
||||
return accountingInfo, errorString(r)
|
||||
}
|
||||
|
||||
r = C.nvmlDeviceGetAccountingBufferSize(h.dev, &buffer)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return
|
||||
}
|
||||
|
||||
if r != C.NVML_SUCCESS {
|
||||
return accountingInfo, errorString(r)
|
||||
}
|
||||
|
||||
accountingInfo = Accounting{
|
||||
Mode: ModeState(mode),
|
||||
BufferSize: uintPtr(buffer),
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (h handle) getDisplayInfo() (display Display, err error) {
|
||||
var mode, isActive C.nvmlEnableState_t
|
||||
|
||||
r := C.nvmlDeviceGetDisplayActive(h.dev, &mode)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return
|
||||
}
|
||||
|
||||
if r != C.NVML_SUCCESS {
|
||||
return display, errorString(r)
|
||||
}
|
||||
|
||||
r = C.nvmlDeviceGetDisplayMode(h.dev, &isActive)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return
|
||||
}
|
||||
if r != C.NVML_SUCCESS {
|
||||
return display, errorString(r)
|
||||
}
|
||||
display = Display{
|
||||
Mode: ModeState(mode),
|
||||
Active: ModeState(isActive),
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (h handle) getPeristenceMode() (state ModeState, err error) {
|
||||
var mode C.nvmlEnableState_t
|
||||
|
||||
r := C.nvmlDeviceGetPersistenceMode(h.dev, &mode)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return
|
||||
}
|
||||
return ModeState(mode), errorString(r)
|
||||
}
|
533
gpushare-device-plugin/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go
generated
vendored
Normal file
533
gpushare-device-plugin/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go
generated
vendored
Normal file
@@ -0,0 +1,533 @@
|
||||
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
package nvml
|
||||
|
||||
// #include "nvml_dl.h"
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Sentinel errors returned by this package.
var (
	// ErrCPUAffinity prefixes the error numaNode reports when the NUMA node
	// file cannot be parsed.
	ErrCPUAffinity = errors.New("failed to retrieve CPU affinity")
	// ErrUnsupportedP2PLink is returned by GetP2PLink for a topology level it
	// does not recognise.
	ErrUnsupportedP2PLink = errors.New("unsupported P2P link type")
	// ErrUnsupportedGPU is returned by NewDevice / NewDeviceLite when the
	// minor number, bus id or UUID of a device is unavailable.
	ErrUnsupportedGPU = errors.New("unsupported GPU device")
)
|
||||
|
||||
// ModeState is the Go view of NVML's nvmlEnableState_t. Values are produced by
// converting the C enum directly (ModeState(mode)), so the numeric values here
// must match the C ones.
type ModeState uint

// The order mirrors nvmlEnableState_t: NVML_FEATURE_DISABLED is 0 and
// NVML_FEATURE_ENABLED is 1. Declaring Enabled first made every converted
// value report the opposite state.
const (
	Disabled ModeState = iota // NVML_FEATURE_DISABLED
	Enabled                   // NVML_FEATURE_ENABLED
)

// String returns "Enabled" or "Disabled", and "N/A" for any other value.
func (m ModeState) String() string {
	switch m {
	case Enabled:
		return "Enabled"
	case Disabled:
		return "Disabled"
	}
	return "N/A"
}
|
||||
|
||||
type Display struct {
|
||||
Mode ModeState
|
||||
Active ModeState
|
||||
}
|
||||
|
||||
type Accounting struct {
|
||||
Mode ModeState
|
||||
BufferSize *uint
|
||||
}
|
||||
|
||||
type DeviceMode struct {
|
||||
DisplayInfo Display
|
||||
Persistence ModeState
|
||||
AccountingInfo Accounting
|
||||
}
|
||||
|
||||
// ThrottleReason identifies why a device's clocks are being throttled.
type ThrottleReason uint

// Throttle reasons, in declaration order starting at zero.
// ThrottleReasonUnknown is used when the reason cannot be determined.
const (
	ThrottleReasonGpuIdle ThrottleReason = iota
	ThrottleReasonApplicationsClocksSetting
	ThrottleReasonSwPowerCap
	ThrottleReasonHwSlowdown
	ThrottleReasonSyncBoost
	ThrottleReasonSwThermalSlowdown
	ThrottleReasonHwThermalSlowdown
	ThrottleReasonHwPowerBrakeSlowdown
	ThrottleReasonDisplayClockSetting
	ThrottleReasonNone
	ThrottleReasonUnknown
)

// throttleReasonNames maps every named reason except ThrottleReasonUnknown to
// its display text.
var throttleReasonNames = [...]string{
	ThrottleReasonGpuIdle:                   "Gpu Idle",
	ThrottleReasonApplicationsClocksSetting: "Applications Clocks Setting",
	ThrottleReasonSwPowerCap:                "SW Power Cap",
	ThrottleReasonHwSlowdown:                "HW Slowdown",
	ThrottleReasonSyncBoost:                 "Sync Boost",
	ThrottleReasonSwThermalSlowdown:         "SW Thermal Slowdown",
	ThrottleReasonHwThermalSlowdown:         "HW Thermal Slowdown",
	ThrottleReasonHwPowerBrakeSlowdown:      "HW Power Brake Slowdown",
	ThrottleReasonDisplayClockSetting:       "Display Clock Setting",
	ThrottleReasonNone:                      "No clocks throttling",
}

// String returns the display text for r, or "N/A" for ThrottleReasonUnknown
// and any value outside the declared range.
func (r ThrottleReason) String() string {
	if r < ThrottleReason(len(throttleReasonNames)) {
		return throttleReasonNames[r]
	}
	return "N/A"
}
|
||||
|
||||
// PerfState is a device performance state (P-state) number.
type PerfState uint

// Bounds of the valid P-state range, plus the value used when the state is
// not known.
const (
	PerfStateMax     = 0
	PerfStateMin     = 15
	PerfStateUnknown = 32
)

// String renders states 0 through 15 as "P<n>" and everything else as
// "Unknown".
func (p PerfState) String() string {
	if p < PerfStateMax || p > PerfStateMin {
		return "Unknown"
	}
	return fmt.Sprintf("P%d", p)
}
|
||||
|
||||
// ProcessType says which kind of GPU context a process holds.
type ProcessType uint

// Process types, in declaration order starting at zero.
const (
	Compute ProcessType = iota
	Graphics
	ComputeAndGraphics
)

// String returns "C" for Compute, "G" for Graphics and "C+G" for every other
// value.
func (t ProcessType) String() string {
	switch t {
	case Compute:
		return "C"
	case Graphics:
		return "G"
	default:
		return "C+G"
	}
}
|
||||
|
||||
// P2PLinkType classifies the path between two devices.
type P2PLinkType uint

// Link types, in declaration order starting at zero (P2PLinkUnknown).
const (
	P2PLinkUnknown P2PLinkType = iota
	P2PLinkCrossCPU
	P2PLinkSameCPU
	P2PLinkHostBridge
	P2PLinkMultiSwitch
	P2PLinkSingleSwitch
	P2PLinkSameBoard
)

// P2PLink pairs a PCI bus id with a link type.
type P2PLink struct {
	BusID string
	Link  P2PLinkType
}

// String returns the display text for t, or "N/A" for P2PLinkUnknown and any
// value outside the declared range.
func (t P2PLinkType) String() string {
	names := [...]string{
		P2PLinkCrossCPU:     "Cross CPU socket",
		P2PLinkSameCPU:      "Same CPU socket",
		P2PLinkHostBridge:   "Host PCI bridge",
		P2PLinkMultiSwitch:  "Multiple PCI switches",
		P2PLinkSingleSwitch: "Single PCI switch",
		P2PLinkSameBoard:    "Same board",
	}
	if t == P2PLinkUnknown || t >= P2PLinkType(len(names)) {
		return "N/A"
	}
	return names[t]
}
|
||||
|
||||
// ClockInfo carries a pair of clock readings in MHz. A nil field means the
// value was not available.
type ClockInfo struct {
	Cores  *uint
	Memory *uint
}

// PCIInfo describes a device's PCI attachment as filled in by NewDevice.
type PCIInfo struct {
	BusID     string
	BAR1      *uint64 // MiB
	Bandwidth *uint   // MB/s
}
|
||||
|
||||
type Device struct {
|
||||
handle
|
||||
|
||||
UUID string
|
||||
Path string
|
||||
Model *string
|
||||
Power *uint
|
||||
Memory *uint64
|
||||
CPUAffinity *uint
|
||||
PCI PCIInfo
|
||||
Clocks ClockInfo
|
||||
Topology []P2PLink
|
||||
}
|
||||
|
||||
// UtilizationInfo holds utilization percentages. Nil means not available.
type UtilizationInfo struct {
	GPU     *uint
	Memory  *uint
	Encoder *uint
	Decoder *uint
}

// PCIThroughputInfo holds PCIe receive/transmit throughput in MB/s.
type PCIThroughputInfo struct {
	RX *uint
	TX *uint
}

// PCIStatusInfo holds the dynamic PCI figures reported by Device.Status.
type PCIStatusInfo struct {
	BAR1Used   *uint64 // MiB
	Throughput PCIThroughputInfo
}

// ECCErrorsInfo holds ECC error counters per memory location.
type ECCErrorsInfo struct {
	L1Cache *uint64
	L2Cache *uint64
	Device  *uint64
}

// DeviceMemory holds used and free device memory in MiB.
type DeviceMemory struct {
	Used *uint64
	Free *uint64
}

// MemoryInfo groups memory usage with ECC error counters.
type MemoryInfo struct {
	Global    DeviceMemory
	ECCErrors ECCErrorsInfo
}
|
||||
|
||||
type ProcessInfo struct {
|
||||
PID uint
|
||||
Name string
|
||||
MemoryUsed uint64
|
||||
Type ProcessType
|
||||
}
|
||||
|
||||
type DeviceStatus struct {
|
||||
Power *uint
|
||||
Temperature *uint
|
||||
Utilization UtilizationInfo
|
||||
Memory MemoryInfo
|
||||
Clocks ClockInfo
|
||||
PCI PCIStatusInfo
|
||||
Processes []ProcessInfo
|
||||
Throttle ThrottleReason
|
||||
Performance PerfState
|
||||
}
|
||||
|
||||
// assert panics with err when err is non-nil and does nothing otherwise.
// Callers in this package pair it with a deferred recover that turns the
// panic back into a returned error.
func assert(err error) {
	if err == nil {
		return
	}
	panic(err)
}
|
||||
|
||||
func Init() error {
|
||||
return init_()
|
||||
}
|
||||
|
||||
func Shutdown() error {
|
||||
return shutdown()
|
||||
}
|
||||
|
||||
func GetDeviceCount() (uint, error) {
|
||||
return deviceGetCount()
|
||||
}
|
||||
|
||||
func GetDriverVersion() (string, error) {
|
||||
return systemGetDriverVersion()
|
||||
}
|
||||
|
||||
func numaNode(busid string) (uint, error) {
|
||||
// discard leading zeros of busid
|
||||
b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", strings.ToLower(busid[4:])))
|
||||
if err != nil {
|
||||
// XXX report node 0 if NUMA support isn't enabled
|
||||
return 0, nil
|
||||
}
|
||||
node, err := strconv.ParseInt(string(bytes.TrimSpace(b)), 10, 8)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("%v: %v", ErrCPUAffinity, err)
|
||||
}
|
||||
if node < 0 {
|
||||
node = 0 // XXX report node 0 instead of NUMA_NO_NODE
|
||||
}
|
||||
return uint(node), nil
|
||||
}
|
||||
|
||||
// pciBandwidth estimates the PCIe bandwidth in MB/s as the per-lane rate of
// the given link generation multiplied by the link width. It returns nil if
// either input is nil. Generations other than 1 through 4 have a per-lane
// rate of 0, so the result points at 0.
func pciBandwidth(gen, width *uint) *uint {
	if gen == nil || width == nil {
		return nil
	}

	var perLane uint // MB/s
	switch *gen {
	case 1:
		perLane = 250
	case 2:
		perLane = 500
	case 3:
		perLane = 985
	case 4:
		perLane = 1969
	}

	total := perLane * *width
	return &total
}
|
||||
|
||||
func NewDevice(idx uint) (device *Device, err error) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
err = r.(error)
|
||||
}
|
||||
}()
|
||||
|
||||
h, err := deviceGetHandleByIndex(idx)
|
||||
assert(err)
|
||||
model, err := h.deviceGetName()
|
||||
assert(err)
|
||||
uuid, err := h.deviceGetUUID()
|
||||
assert(err)
|
||||
minor, err := h.deviceGetMinorNumber()
|
||||
assert(err)
|
||||
power, err := h.deviceGetPowerManagementLimit()
|
||||
assert(err)
|
||||
totalMem, _, err := h.deviceGetMemoryInfo()
|
||||
assert(err)
|
||||
busid, err := h.deviceGetPciInfo()
|
||||
assert(err)
|
||||
bar1, _, err := h.deviceGetBAR1MemoryInfo()
|
||||
assert(err)
|
||||
pcig, err := h.deviceGetMaxPcieLinkGeneration()
|
||||
assert(err)
|
||||
pciw, err := h.deviceGetMaxPcieLinkWidth()
|
||||
assert(err)
|
||||
ccore, cmem, err := h.deviceGetMaxClockInfo()
|
||||
assert(err)
|
||||
|
||||
if minor == nil || busid == nil || uuid == nil {
|
||||
return nil, ErrUnsupportedGPU
|
||||
}
|
||||
path := fmt.Sprintf("/dev/nvidia%d", *minor)
|
||||
node, err := numaNode(*busid)
|
||||
assert(err)
|
||||
|
||||
device = &Device{
|
||||
handle: h,
|
||||
UUID: *uuid,
|
||||
Path: path,
|
||||
Model: model,
|
||||
Power: power,
|
||||
Memory: totalMem,
|
||||
CPUAffinity: &node,
|
||||
PCI: PCIInfo{
|
||||
BusID: *busid,
|
||||
BAR1: bar1,
|
||||
Bandwidth: pciBandwidth(pcig, pciw), // MB/s
|
||||
},
|
||||
Clocks: ClockInfo{
|
||||
Cores: ccore, // MHz
|
||||
Memory: cmem, // MHz
|
||||
},
|
||||
}
|
||||
if power != nil {
|
||||
*device.Power /= 1000 // W
|
||||
}
|
||||
if bar1 != nil {
|
||||
*device.PCI.BAR1 /= 1024 * 1024 // MiB
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func NewDeviceLite(idx uint) (device *Device, err error) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
err = r.(error)
|
||||
}
|
||||
}()
|
||||
|
||||
h, err := deviceGetHandleByIndex(idx)
|
||||
assert(err)
|
||||
uuid, err := h.deviceGetUUID()
|
||||
assert(err)
|
||||
minor, err := h.deviceGetMinorNumber()
|
||||
assert(err)
|
||||
busid, err := h.deviceGetPciInfo()
|
||||
assert(err)
|
||||
|
||||
if minor == nil || busid == nil || uuid == nil {
|
||||
return nil, ErrUnsupportedGPU
|
||||
}
|
||||
path := fmt.Sprintf("/dev/nvidia%d", *minor)
|
||||
|
||||
device = &Device{
|
||||
handle: h,
|
||||
UUID: *uuid,
|
||||
Path: path,
|
||||
PCI: PCIInfo{
|
||||
BusID: *busid,
|
||||
},
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (d *Device) Status() (status *DeviceStatus, err error) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
err = r.(error)
|
||||
}
|
||||
}()
|
||||
|
||||
power, err := d.deviceGetPowerUsage()
|
||||
assert(err)
|
||||
temp, err := d.deviceGetTemperature()
|
||||
assert(err)
|
||||
ugpu, umem, err := d.deviceGetUtilizationRates()
|
||||
assert(err)
|
||||
uenc, err := d.deviceGetEncoderUtilization()
|
||||
assert(err)
|
||||
udec, err := d.deviceGetDecoderUtilization()
|
||||
assert(err)
|
||||
_, devMem, err := d.deviceGetMemoryInfo()
|
||||
assert(err)
|
||||
ccore, cmem, err := d.deviceGetClockInfo()
|
||||
assert(err)
|
||||
_, bar1, err := d.deviceGetBAR1MemoryInfo()
|
||||
assert(err)
|
||||
el1, el2, emem, err := d.deviceGetMemoryErrorCounter()
|
||||
assert(err)
|
||||
pcirx, pcitx, err := d.deviceGetPcieThroughput()
|
||||
assert(err)
|
||||
throttle, err := d.getClocksThrottleReasons()
|
||||
assert(err)
|
||||
perfState, err := d.getPerformanceState()
|
||||
assert(err)
|
||||
processInfo, err := d.deviceGetAllRunningProcesses()
|
||||
assert(err)
|
||||
|
||||
status = &DeviceStatus{
|
||||
Power: power,
|
||||
Temperature: temp, // °C
|
||||
Utilization: UtilizationInfo{
|
||||
GPU: ugpu, // %
|
||||
Memory: umem, // %
|
||||
Encoder: uenc, // %
|
||||
Decoder: udec, // %
|
||||
},
|
||||
Memory: MemoryInfo{
|
||||
Global: devMem,
|
||||
ECCErrors: ECCErrorsInfo{
|
||||
L1Cache: el1,
|
||||
L2Cache: el2,
|
||||
Device: emem,
|
||||
},
|
||||
},
|
||||
Clocks: ClockInfo{
|
||||
Cores: ccore, // MHz
|
||||
Memory: cmem, // MHz
|
||||
},
|
||||
PCI: PCIStatusInfo{
|
||||
BAR1Used: bar1,
|
||||
Throughput: PCIThroughputInfo{
|
||||
RX: pcirx,
|
||||
TX: pcitx,
|
||||
},
|
||||
},
|
||||
Throttle: throttle,
|
||||
Performance: perfState,
|
||||
Processes: processInfo,
|
||||
}
|
||||
if power != nil {
|
||||
*status.Power /= 1000 // W
|
||||
}
|
||||
if bar1 != nil {
|
||||
*status.PCI.BAR1Used /= 1024 * 1024 // MiB
|
||||
}
|
||||
if pcirx != nil {
|
||||
*status.PCI.Throughput.RX /= 1000 // MB/s
|
||||
}
|
||||
if pcitx != nil {
|
||||
*status.PCI.Throughput.TX /= 1000 // MB/s
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func GetP2PLink(dev1, dev2 *Device) (link P2PLinkType, err error) {
|
||||
level, err := deviceGetTopologyCommonAncestor(dev1.handle, dev2.handle)
|
||||
if err != nil || level == nil {
|
||||
return P2PLinkUnknown, err
|
||||
}
|
||||
|
||||
switch *level {
|
||||
case C.NVML_TOPOLOGY_INTERNAL:
|
||||
link = P2PLinkSameBoard
|
||||
case C.NVML_TOPOLOGY_SINGLE:
|
||||
link = P2PLinkSingleSwitch
|
||||
case C.NVML_TOPOLOGY_MULTIPLE:
|
||||
link = P2PLinkMultiSwitch
|
||||
case C.NVML_TOPOLOGY_HOSTBRIDGE:
|
||||
link = P2PLinkHostBridge
|
||||
case C.NVML_TOPOLOGY_CPU:
|
||||
link = P2PLinkSameCPU
|
||||
case C.NVML_TOPOLOGY_SYSTEM:
|
||||
link = P2PLinkCrossCPU
|
||||
default:
|
||||
err = ErrUnsupportedP2PLink
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (d *Device) GetComputeRunningProcesses() ([]uint, []uint64, error) {
|
||||
return d.handle.deviceGetComputeRunningProcesses()
|
||||
}
|
||||
|
||||
func (d *Device) GetGraphicsRunningProcesses() ([]uint, []uint64, error) {
|
||||
return d.handle.deviceGetGraphicsRunningProcesses()
|
||||
}
|
||||
|
||||
func (d *Device) GetAllRunningProcesses() ([]ProcessInfo, error) {
|
||||
return d.handle.deviceGetAllRunningProcesses()
|
||||
}
|
||||
|
||||
func (d *Device) GetDeviceMode() (mode *DeviceMode, err error) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
err = r.(error)
|
||||
}
|
||||
}()
|
||||
|
||||
display, err := d.getDisplayInfo()
|
||||
assert(err)
|
||||
|
||||
p, err := d.getPeristenceMode()
|
||||
assert(err)
|
||||
|
||||
accounting, err := d.getAccountingInfo()
|
||||
assert(err)
|
||||
|
||||
mode = &DeviceMode{
|
||||
DisplayInfo: display,
|
||||
Persistence: p,
|
||||
AccountingInfo: accounting,
|
||||
}
|
||||
return
|
||||
}
|
5871
gpushare-device-plugin/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h
generated
vendored
Normal file
5871
gpushare-device-plugin/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
46
gpushare-device-plugin/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c
generated
vendored
Normal file
46
gpushare-device-plugin/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c
generated
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <dlfcn.h>
|
||||
|
||||
#include "nvml_dl.h"
|
||||
|
||||
#define DLSYM(x, sym) \
|
||||
do { \
|
||||
dlerror(); \
|
||||
x = dlsym(handle, #sym); \
|
||||
if (dlerror() != NULL) { \
|
||||
return (NVML_ERROR_FUNCTION_NOT_FOUND); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
typedef nvmlReturn_t (*nvmlSym_t)();
|
||||
|
||||
static void *handle;
|
||||
|
||||
nvmlReturn_t NVML_DL(nvmlInit)(void)
|
||||
{
|
||||
handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY | RTLD_GLOBAL);
|
||||
if (handle == NULL) {
|
||||
return (NVML_ERROR_LIBRARY_NOT_FOUND);
|
||||
}
|
||||
return (nvmlInit());
|
||||
}
|
||||
|
||||
nvmlReturn_t NVML_DL(nvmlShutdown)(void)
|
||||
{
|
||||
nvmlReturn_t r = nvmlShutdown();
|
||||
if (r != NVML_SUCCESS) {
|
||||
return (r);
|
||||
}
|
||||
return (dlclose(handle) ? NVML_ERROR_UNKNOWN : NVML_SUCCESS);
|
||||
}
|
||||
|
||||
/*
 * Resolves nvmlDeviceGetTopologyCommonAncestor at run time and calls it.
 *
 * Returns NVML_ERROR_FUNCTION_NOT_FOUND (via DLSYM) when the loaded library
 * does not provide the symbol, otherwise the result of the NVML call.
 */
nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)(
  nvmlDevice_t dev1, nvmlDevice_t dev2, nvmlGpuTopologyLevel_t *info)
{
    /*
     * Fully prototyped pointer: the arguments are type-checked and the call
     * does not depend on unprototyped `()` declarator semantics.
     */
    nvmlReturn_t (*sym)(nvmlDevice_t, nvmlDevice_t, nvmlGpuTopologyLevel_t *);

    DLSYM(sym, nvmlDeviceGetTopologyCommonAncestor);
    return ((*sym)(dev1, dev2, info));
}
|
15
gpushare-device-plugin/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h
generated
vendored
Normal file
15
gpushare-device-plugin/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h
generated
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
#ifndef _NVML_DL_H_
#define _NVML_DL_H_

#include "nvml.h"

/* NVML_DL(x) names the dynamically loading wrapper for NVML function x. */
#define NVML_DL(x) x##_dl

/* Loads the NVML shared library and initialises NVML. */
nvmlReturn_t NVML_DL(nvmlInit)(void);

/* Shuts NVML down and closes the shared library. */
nvmlReturn_t NVML_DL(nvmlShutdown)(void);

/* Looks the NVML topology query up at run time and calls it. */
nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)(
  nvmlDevice_t, nvmlDevice_t, nvmlGpuTopologyLevel_t *);

#endif // _NVML_DL_H_
|
25
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/LICENSE
generated
vendored
Normal file
25
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/LICENSE
generated
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
36
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/build/deb/copyright
generated
vendored
Normal file
36
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/build/deb/copyright
generated
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
|
||||
Upstream-Name: #PACKAGE#
|
||||
Source: https://github.com/NVIDIA/nvidia-docker
|
||||
|
||||
Files: *
|
||||
Copyright: #YEAR# #USERNAME# <#EMAIL#>
|
||||
License: BSD-3-Clause
|
||||
|
||||
Files: debian/*
|
||||
Copyright: #YEAR# #USERNAME# <#EMAIL#>
|
||||
License: BSD-3-Clause
|
||||
|
||||
License: BSD-3-Clause
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of #USERNAME# nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
1
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/build/deb/nvidia-docker.service
generated
vendored
Normal file
1
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/build/deb/nvidia-docker.service
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
../common/nvidia-docker.service
|
25
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/build/rpm/SOURCES/LICENSE
generated
vendored
Normal file
25
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/build/rpm/SOURCES/LICENSE
generated
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
1
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/build/rpm/SOURCES/nvidia-docker.service
generated
vendored
Normal file
1
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/build/rpm/SOURCES/nvidia-docker.service
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
../../common/nvidia-docker.service
|
311
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/src/nvml/bindings.go
generated
vendored
Normal file
311
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/src/nvml/bindings.go
generated
vendored
Normal file
@@ -0,0 +1,311 @@
|
||||
// Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
package nvml
|
||||
|
||||
// #cgo LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files
|
||||
// #include "nvml_dl.h"
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
const (
|
||||
szDriver = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE
|
||||
szName = C.NVML_DEVICE_NAME_BUFFER_SIZE
|
||||
szUUID = C.NVML_DEVICE_UUID_BUFFER_SIZE
|
||||
szProcs = 32
|
||||
szProcName = 64
|
||||
)
|
||||
|
||||
// handle wraps the NVML device handle that the device-level queries operate on.
type handle struct {
	dev C.nvmlDevice_t
}
|
||||
|
||||
func uintPtr(c C.uint) *uint {
|
||||
i := uint(c)
|
||||
return &i
|
||||
}
|
||||
|
||||
func uint64Ptr(c C.ulonglong) *uint64 {
|
||||
i := uint64(c)
|
||||
return &i
|
||||
}
|
||||
|
||||
func stringPtr(c *C.char) *string {
|
||||
s := C.GoString(c)
|
||||
return &s
|
||||
}
|
||||
|
||||
func errorString(ret C.nvmlReturn_t) error {
|
||||
if ret == C.NVML_SUCCESS {
|
||||
return nil
|
||||
}
|
||||
err := C.GoString(C.nvmlErrorString(ret))
|
||||
return fmt.Errorf("nvml: %v", err)
|
||||
}
|
||||
|
||||
func init_() error {
|
||||
r := C.nvmlInit_dl()
|
||||
if r == C.NVML_ERROR_LIBRARY_NOT_FOUND {
|
||||
return errors.New("could not load NVML library")
|
||||
}
|
||||
return errorString(r)
|
||||
}
|
||||
|
||||
func shutdown() error {
|
||||
return errorString(C.nvmlShutdown_dl())
|
||||
}
|
||||
|
||||
func systemGetDriverVersion() (string, error) {
|
||||
var driver [szDriver]C.char
|
||||
|
||||
r := C.nvmlSystemGetDriverVersion(&driver[0], szDriver)
|
||||
return C.GoString(&driver[0]), errorString(r)
|
||||
}
|
||||
|
||||
func systemGetProcessName(pid uint) (string, error) {
|
||||
var proc [szProcName]C.char
|
||||
|
||||
r := C.nvmlSystemGetProcessName(C.uint(pid), &proc[0], szProcName)
|
||||
return C.GoString(&proc[0]), errorString(r)
|
||||
}
|
||||
|
||||
func deviceGetCount() (uint, error) {
|
||||
var n C.uint
|
||||
|
||||
r := C.nvmlDeviceGetCount(&n)
|
||||
return uint(n), errorString(r)
|
||||
}
|
||||
|
||||
func deviceGetHandleByIndex(idx uint) (handle, error) {
|
||||
var dev C.nvmlDevice_t
|
||||
|
||||
r := C.nvmlDeviceGetHandleByIndex(C.uint(idx), &dev)
|
||||
return handle{dev}, errorString(r)
|
||||
}
|
||||
|
||||
func deviceGetTopologyCommonAncestor(h1, h2 handle) (*uint, error) {
|
||||
var level C.nvmlGpuTopologyLevel_t
|
||||
|
||||
r := C.nvmlDeviceGetTopologyCommonAncestor_dl(h1.dev, h2.dev, &level)
|
||||
if r == C.NVML_ERROR_FUNCTION_NOT_FOUND || r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(C.uint(level)), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetName() (*string, error) {
|
||||
var name [szName]C.char
|
||||
|
||||
r := C.nvmlDeviceGetName(h.dev, &name[0], szName)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return stringPtr(&name[0]), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetUUID() (*string, error) {
|
||||
var uuid [szUUID]C.char
|
||||
|
||||
r := C.nvmlDeviceGetUUID(h.dev, &uuid[0], szUUID)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return stringPtr(&uuid[0]), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetPciInfo() (*string, error) {
|
||||
var pci C.nvmlPciInfo_t
|
||||
|
||||
r := C.nvmlDeviceGetPciInfo(h.dev, &pci)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return stringPtr(&pci.busId[0]), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMinorNumber() (*uint, error) {
|
||||
var minor C.uint
|
||||
|
||||
r := C.nvmlDeviceGetMinorNumber(h.dev, &minor)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(minor), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetBAR1MemoryInfo() (*uint64, *uint64, error) {
|
||||
var bar1 C.nvmlBAR1Memory_t
|
||||
|
||||
r := C.nvmlDeviceGetBAR1MemoryInfo(h.dev, &bar1)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
return uint64Ptr(bar1.bar1Total), uint64Ptr(bar1.bar1Used), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetPowerManagementLimit() (*uint, error) {
|
||||
var power C.uint
|
||||
|
||||
r := C.nvmlDeviceGetPowerManagementLimit(h.dev, &power)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(power), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMaxClockInfo() (*uint, *uint, error) {
|
||||
var sm, mem C.uint
|
||||
|
||||
r := C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_SM, &sm)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
if r == C.NVML_SUCCESS {
|
||||
r = C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem)
|
||||
}
|
||||
return uintPtr(sm), uintPtr(mem), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMaxPcieLinkGeneration() (*uint, error) {
|
||||
var link C.uint
|
||||
|
||||
r := C.nvmlDeviceGetMaxPcieLinkGeneration(h.dev, &link)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(link), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMaxPcieLinkWidth() (*uint, error) {
|
||||
var width C.uint
|
||||
|
||||
r := C.nvmlDeviceGetMaxPcieLinkWidth(h.dev, &width)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(width), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetPowerUsage() (*uint, error) {
|
||||
var power C.uint
|
||||
|
||||
r := C.nvmlDeviceGetPowerUsage(h.dev, &power)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(power), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetTemperature() (*uint, error) {
|
||||
var temp C.uint
|
||||
|
||||
r := C.nvmlDeviceGetTemperature(h.dev, C.NVML_TEMPERATURE_GPU, &temp)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(temp), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetUtilizationRates() (*uint, *uint, error) {
|
||||
var usage C.nvmlUtilization_t
|
||||
|
||||
r := C.nvmlDeviceGetUtilizationRates(h.dev, &usage)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
return uintPtr(usage.gpu), uintPtr(usage.memory), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetEncoderUtilization() (*uint, error) {
|
||||
var usage, sampling C.uint
|
||||
|
||||
r := C.nvmlDeviceGetEncoderUtilization(h.dev, &usage, &sampling)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(usage), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetDecoderUtilization() (*uint, error) {
|
||||
var usage, sampling C.uint
|
||||
|
||||
r := C.nvmlDeviceGetDecoderUtilization(h.dev, &usage, &sampling)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(usage), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMemoryInfo() (*uint64, error) {
|
||||
var mem C.nvmlMemory_t
|
||||
|
||||
r := C.nvmlDeviceGetMemoryInfo(h.dev, &mem)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uint64Ptr(mem.used), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetClockInfo() (*uint, *uint, error) {
|
||||
var sm, mem C.uint
|
||||
|
||||
r := C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_SM, &sm)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
if r == C.NVML_SUCCESS {
|
||||
r = C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem)
|
||||
}
|
||||
return uintPtr(sm), uintPtr(mem), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMemoryErrorCounter() (*uint64, *uint64, *uint64, error) {
|
||||
var l1, l2, mem C.ulonglong
|
||||
|
||||
r := C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
|
||||
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L1_CACHE, &l1)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil, nil
|
||||
}
|
||||
if r == C.NVML_SUCCESS {
|
||||
r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
|
||||
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L2_CACHE, &l2)
|
||||
}
|
||||
if r == C.NVML_SUCCESS {
|
||||
r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
|
||||
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_DEVICE_MEMORY, &mem)
|
||||
}
|
||||
return uint64Ptr(l1), uint64Ptr(l2), uint64Ptr(mem), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetPcieThroughput() (*uint, *uint, error) {
|
||||
var rx, tx C.uint
|
||||
|
||||
r := C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_RX_BYTES, &rx)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
if r == C.NVML_SUCCESS {
|
||||
r = C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_TX_BYTES, &tx)
|
||||
}
|
||||
return uintPtr(rx), uintPtr(tx), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetComputeRunningProcesses() ([]uint, []uint64, error) {
|
||||
var procs [szProcs]C.nvmlProcessInfo_t
|
||||
var count = C.uint(szProcs)
|
||||
|
||||
r := C.nvmlDeviceGetComputeRunningProcesses(h.dev, &count, &procs[0])
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
n := int(count)
|
||||
pids := make([]uint, n)
|
||||
mems := make([]uint64, n)
|
||||
for i := 0; i < n; i++ {
|
||||
pids[i] = uint(procs[i].pid)
|
||||
mems[i] = uint64(procs[i].usedGpuMemory)
|
||||
}
|
||||
return pids, mems, errorString(r)
|
||||
}
|
381
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/src/nvml/nvml.go
generated
vendored
Normal file
381
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/src/nvml/nvml.go
generated
vendored
Normal file
@@ -0,0 +1,381 @@
|
||||
// Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
package nvml
|
||||
|
||||
// #include "nvml_dl.h"
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ErrCPUAffinity is the error numaNode embeds in its message when the NUMA
// node of a device cannot be parsed.
var ErrCPUAffinity = errors.New("failed to retrieve CPU affinity")

// ErrUnsupportedP2PLink is returned by GetP2PLink for a topology level it
// does not recognise.
var ErrUnsupportedP2PLink = errors.New("unsupported P2P link type")

// ErrUnsupportedGPU is returned by NewDevice and NewDeviceLite when the
// device does not report a minor number, bus ID or UUID.
var ErrUnsupportedGPU = errors.New("unsupported GPU device")
|
||||
|
||||
// P2PLinkType identifies the kind of link between two devices.
type P2PLinkType uint

// Link types. P2PLinkUnknown is the zero value.
const (
	P2PLinkUnknown P2PLinkType = iota
	P2PLinkCrossCPU
	P2PLinkSameCPU
	P2PLinkHostBridge
	P2PLinkMultiSwitch
	P2PLinkSingleSwitch
	P2PLinkSameBoard
)

// P2PLink pairs a bus ID with a link type.
type P2PLink struct {
	BusID string
	Link  P2PLinkType
}

// String returns a human-readable description of the link type. It returns
// "N/A" for P2PLinkUnknown and for any value outside the declared constants.
func (t P2PLinkType) String() string {
	// Indexed by link type; P2PLinkUnknown is deliberately left empty.
	labels := [...]string{
		P2PLinkCrossCPU:     "Cross CPU socket",
		P2PLinkSameCPU:      "Same CPU socket",
		P2PLinkHostBridge:   "Host PCI bridge",
		P2PLinkMultiSwitch:  "Multiple PCI switches",
		P2PLinkSingleSwitch: "Single PCI switch",
		P2PLinkSameBoard:    "Same board",
	}
	if uint(t) < uint(len(labels)) && labels[t] != "" {
		return labels[t]
	}
	return "N/A"
}
|
||||
|
||||
// ClockInfo holds core and memory clock values. A nil field means the value
// was not available.
type ClockInfo struct {
	Cores  *uint // core (SM) clock
	Memory *uint // memory clock
}
|
||||
|
||||
// PCIInfo holds the static PCI properties of a device. Pointer fields are nil
// when the value was not available.
type PCIInfo struct {
	BusID     string  // PCI bus ID of the device
	BAR1      *uint64 // BAR1 memory size
	Bandwidth *uint   // PCIe bandwidth
}
|
||||
|
||||
type Device struct {
|
||||
handle
|
||||
|
||||
UUID string
|
||||
Path string
|
||||
Model *string
|
||||
Power *uint
|
||||
CPUAffinity *uint
|
||||
PCI PCIInfo
|
||||
Clocks ClockInfo
|
||||
Topology []P2PLink
|
||||
}
|
||||
|
||||
// UtilizationInfo holds the utilization readings of a device. A nil field
// means the value was not available.
type UtilizationInfo struct {
	GPU     *uint // GPU utilization
	Memory  *uint // memory utilization
	Encoder *uint // encoder utilization
	Decoder *uint // decoder utilization
}
|
||||
|
||||
// PCIThroughputInfo holds PCIe throughput in each direction. A nil field
// means the value was not available.
type PCIThroughputInfo struct {
	RX *uint // receive throughput
	TX *uint // transmit throughput
}

// PCIStatusInfo holds the dynamic PCI state of a device.
type PCIStatusInfo struct {
	BAR1Used   *uint64           // BAR1 memory in use; nil when not available
	Throughput PCIThroughputInfo // PCIe throughput
}
|
||||
|
||||
// ECCErrorsInfo holds ECC error counters by memory location. A nil field
// means the value was not available.
type ECCErrorsInfo struct {
	L1Cache *uint64 // L1 cache errors
	L2Cache *uint64 // L2 cache errors
	Global  *uint64 // device memory errors
}

// MemoryInfo holds the memory state of a device.
type MemoryInfo struct {
	GlobalUsed *uint64       // device memory in use; nil when not available
	ECCErrors  ECCErrorsInfo // ECC error counters
}
|
||||
|
||||
// ProcessInfo describes one process running on a device.
type ProcessInfo struct {
	PID        uint   // process ID
	Name       string // process name
	MemoryUsed uint64 // device memory used by the process
}
|
||||
|
||||
type DeviceStatus struct {
|
||||
Power *uint
|
||||
Temperature *uint
|
||||
Utilization UtilizationInfo
|
||||
Memory MemoryInfo
|
||||
Clocks ClockInfo
|
||||
PCI PCIStatusInfo
|
||||
Processes []ProcessInfo
|
||||
}
|
||||
|
||||
// assert panics with err when err is non-nil and does nothing otherwise. It
// is used together with a deferred recover that turns the panic back into a
// returned error.
func assert(err error) {
	if err == nil {
		return
	}
	panic(err)
}
|
||||
|
||||
func Init() error {
|
||||
return init_()
|
||||
}
|
||||
|
||||
func Shutdown() error {
|
||||
return shutdown()
|
||||
}
|
||||
|
||||
func GetDeviceCount() (uint, error) {
|
||||
return deviceGetCount()
|
||||
}
|
||||
|
||||
func GetDriverVersion() (string, error) {
|
||||
return systemGetDriverVersion()
|
||||
}
|
||||
|
||||
func numaNode(busid string) (uint, error) {
|
||||
b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", strings.ToLower(busid)))
|
||||
if err != nil {
|
||||
// XXX report node 0 if NUMA support isn't enabled
|
||||
return 0, nil
|
||||
}
|
||||
node, err := strconv.ParseInt(string(bytes.TrimSpace(b)), 10, 8)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("%v: %v", ErrCPUAffinity, err)
|
||||
}
|
||||
if node < 0 {
|
||||
node = 0 // XXX report node 0 instead of NUMA_NO_NODE
|
||||
}
|
||||
return uint(node), nil
|
||||
}
|
||||
|
||||
// pciBandwidth returns the PCIe bandwidth in MB/s for the given link
// generation and width: the per-lane rate of the generation multiplied by
// the width.
//
// It returns nil when either argument is nil. Generations other than 1 to 4
// have a per-lane rate of 0, so the result points to 0.
func pciBandwidth(gen, width *uint) *uint {
	if gen == nil || width == nil {
		return nil
	}

	var perLane uint // MB/s
	switch *gen {
	case 1:
		perLane = 250
	case 2:
		perLane = 500
	case 3:
		perLane = 985
	case 4:
		perLane = 1969
	}

	total := perLane * *width
	return &total
}
|
||||
|
||||
func NewDevice(idx uint) (device *Device, err error) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
err = r.(error)
|
||||
}
|
||||
}()
|
||||
|
||||
h, err := deviceGetHandleByIndex(idx)
|
||||
assert(err)
|
||||
model, err := h.deviceGetName()
|
||||
assert(err)
|
||||
uuid, err := h.deviceGetUUID()
|
||||
assert(err)
|
||||
minor, err := h.deviceGetMinorNumber()
|
||||
assert(err)
|
||||
power, err := h.deviceGetPowerManagementLimit()
|
||||
assert(err)
|
||||
busid, err := h.deviceGetPciInfo()
|
||||
assert(err)
|
||||
bar1, _, err := h.deviceGetBAR1MemoryInfo()
|
||||
assert(err)
|
||||
pcig, err := h.deviceGetMaxPcieLinkGeneration()
|
||||
assert(err)
|
||||
pciw, err := h.deviceGetMaxPcieLinkWidth()
|
||||
assert(err)
|
||||
ccore, cmem, err := h.deviceGetMaxClockInfo()
|
||||
assert(err)
|
||||
|
||||
if minor == nil || busid == nil || uuid == nil {
|
||||
return nil, ErrUnsupportedGPU
|
||||
}
|
||||
path := fmt.Sprintf("/dev/nvidia%d", *minor)
|
||||
node, err := numaNode(*busid)
|
||||
assert(err)
|
||||
|
||||
device = &Device{
|
||||
handle: h,
|
||||
UUID: *uuid,
|
||||
Path: path,
|
||||
Model: model,
|
||||
Power: power,
|
||||
CPUAffinity: &node,
|
||||
PCI: PCIInfo{
|
||||
BusID: *busid,
|
||||
BAR1: bar1,
|
||||
Bandwidth: pciBandwidth(pcig, pciw), // MB/s
|
||||
},
|
||||
Clocks: ClockInfo{
|
||||
Cores: ccore, // MHz
|
||||
Memory: cmem, // MHz
|
||||
},
|
||||
}
|
||||
if power != nil {
|
||||
*device.Power /= 1000 // W
|
||||
}
|
||||
if bar1 != nil {
|
||||
*device.PCI.BAR1 /= 1024 * 1024 // MiB
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func NewDeviceLite(idx uint) (device *Device, err error) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
err = r.(error)
|
||||
}
|
||||
}()
|
||||
|
||||
h, err := deviceGetHandleByIndex(idx)
|
||||
assert(err)
|
||||
uuid, err := h.deviceGetUUID()
|
||||
assert(err)
|
||||
minor, err := h.deviceGetMinorNumber()
|
||||
assert(err)
|
||||
busid, err := h.deviceGetPciInfo()
|
||||
assert(err)
|
||||
|
||||
if minor == nil || busid == nil || uuid == nil {
|
||||
return nil, ErrUnsupportedGPU
|
||||
}
|
||||
path := fmt.Sprintf("/dev/nvidia%d", *minor)
|
||||
|
||||
device = &Device{
|
||||
handle: h,
|
||||
UUID: *uuid,
|
||||
Path: path,
|
||||
PCI: PCIInfo{
|
||||
BusID: *busid,
|
||||
},
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (d *Device) Status() (status *DeviceStatus, err error) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
err = r.(error)
|
||||
}
|
||||
}()
|
||||
|
||||
power, err := d.deviceGetPowerUsage()
|
||||
assert(err)
|
||||
temp, err := d.deviceGetTemperature()
|
||||
assert(err)
|
||||
ugpu, umem, err := d.deviceGetUtilizationRates()
|
||||
assert(err)
|
||||
uenc, err := d.deviceGetEncoderUtilization()
|
||||
assert(err)
|
||||
udec, err := d.deviceGetDecoderUtilization()
|
||||
assert(err)
|
||||
mem, err := d.deviceGetMemoryInfo()
|
||||
assert(err)
|
||||
ccore, cmem, err := d.deviceGetClockInfo()
|
||||
assert(err)
|
||||
_, bar1, err := d.deviceGetBAR1MemoryInfo()
|
||||
assert(err)
|
||||
pids, pmems, err := d.deviceGetComputeRunningProcesses()
|
||||
assert(err)
|
||||
el1, el2, emem, err := d.deviceGetMemoryErrorCounter()
|
||||
assert(err)
|
||||
pcirx, pcitx, err := d.deviceGetPcieThroughput()
|
||||
assert(err)
|
||||
|
||||
status = &DeviceStatus{
|
||||
Power: power,
|
||||
Temperature: temp, // °C
|
||||
Utilization: UtilizationInfo{
|
||||
GPU: ugpu, // %
|
||||
Memory: umem, // %
|
||||
Encoder: uenc, // %
|
||||
Decoder: udec, // %
|
||||
},
|
||||
Memory: MemoryInfo{
|
||||
GlobalUsed: mem,
|
||||
ECCErrors: ECCErrorsInfo{
|
||||
L1Cache: el1,
|
||||
L2Cache: el2,
|
||||
Global: emem,
|
||||
},
|
||||
},
|
||||
Clocks: ClockInfo{
|
||||
Cores: ccore, // MHz
|
||||
Memory: cmem, // MHz
|
||||
},
|
||||
PCI: PCIStatusInfo{
|
||||
BAR1Used: bar1,
|
||||
Throughput: PCIThroughputInfo{
|
||||
RX: pcirx,
|
||||
TX: pcitx,
|
||||
},
|
||||
},
|
||||
}
|
||||
if power != nil {
|
||||
*status.Power /= 1000 // W
|
||||
}
|
||||
if mem != nil {
|
||||
*status.Memory.GlobalUsed /= 1024 * 1024 // MiB
|
||||
}
|
||||
if bar1 != nil {
|
||||
*status.PCI.BAR1Used /= 1024 * 1024 // MiB
|
||||
}
|
||||
if pcirx != nil {
|
||||
*status.PCI.Throughput.RX /= 1000 // MB/s
|
||||
}
|
||||
if pcitx != nil {
|
||||
*status.PCI.Throughput.TX /= 1000 // MB/s
|
||||
}
|
||||
for i := range pids {
|
||||
name, err := systemGetProcessName(pids[i])
|
||||
assert(err)
|
||||
status.Processes = append(status.Processes, ProcessInfo{
|
||||
PID: pids[i],
|
||||
Name: name,
|
||||
MemoryUsed: pmems[i] / (1024 * 1024), // MiB
|
||||
})
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func GetP2PLink(dev1, dev2 *Device) (link P2PLinkType, err error) {
|
||||
level, err := deviceGetTopologyCommonAncestor(dev1.handle, dev2.handle)
|
||||
if err != nil || level == nil {
|
||||
return P2PLinkUnknown, err
|
||||
}
|
||||
|
||||
switch *level {
|
||||
case C.NVML_TOPOLOGY_INTERNAL:
|
||||
link = P2PLinkSameBoard
|
||||
case C.NVML_TOPOLOGY_SINGLE:
|
||||
link = P2PLinkSingleSwitch
|
||||
case C.NVML_TOPOLOGY_MULTIPLE:
|
||||
link = P2PLinkMultiSwitch
|
||||
case C.NVML_TOPOLOGY_HOSTBRIDGE:
|
||||
link = P2PLinkHostBridge
|
||||
case C.NVML_TOPOLOGY_CPU:
|
||||
link = P2PLinkSameCPU
|
||||
case C.NVML_TOPOLOGY_SYSTEM:
|
||||
link = P2PLinkCrossCPU
|
||||
default:
|
||||
err = ErrUnsupportedP2PLink
|
||||
}
|
||||
return
|
||||
}
|
46
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/src/nvml/nvml_dl.c
generated
vendored
Normal file
46
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/src/nvml/nvml_dl.c
generated
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
// Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <dlfcn.h>
|
||||
|
||||
#include "nvml_dl.h"
|
||||
|
||||
#define DLSYM(x, sym) \
|
||||
do { \
|
||||
dlerror(); \
|
||||
x = dlsym(handle, #sym); \
|
||||
if (dlerror() != NULL) { \
|
||||
return (NVML_ERROR_FUNCTION_NOT_FOUND); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
typedef nvmlReturn_t (*nvmlSym_t)();
|
||||
|
||||
static void *handle;
|
||||
|
||||
nvmlReturn_t NVML_DL(nvmlInit)(void)
|
||||
{
|
||||
handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY | RTLD_GLOBAL);
|
||||
if (handle == NULL) {
|
||||
return (NVML_ERROR_LIBRARY_NOT_FOUND);
|
||||
}
|
||||
return (nvmlInit());
|
||||
}
|
||||
|
||||
nvmlReturn_t NVML_DL(nvmlShutdown)(void)
|
||||
{
|
||||
nvmlReturn_t r = nvmlShutdown();
|
||||
if (r != NVML_SUCCESS) {
|
||||
return (r);
|
||||
}
|
||||
return (dlclose(handle) ? NVML_ERROR_UNKNOWN : NVML_SUCCESS);
|
||||
}
|
||||
|
||||
/*
 * Resolves nvmlDeviceGetTopologyCommonAncestor at run time and calls it.
 *
 * Returns NVML_ERROR_FUNCTION_NOT_FOUND (via DLSYM) when the loaded library
 * does not provide the symbol, otherwise the result of the NVML call.
 */
nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)(
  nvmlDevice_t dev1, nvmlDevice_t dev2, nvmlGpuTopologyLevel_t *info)
{
    /*
     * Fully prototyped pointer: the arguments are type-checked and the call
     * does not depend on unprototyped `()` declarator semantics.
     */
    nvmlReturn_t (*sym)(nvmlDevice_t, nvmlDevice_t, nvmlGpuTopologyLevel_t *);

    DLSYM(sym, nvmlDeviceGetTopologyCommonAncestor);
    return ((*sym)(dev1, dev2, info));
}
|
15
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/src/nvml/nvml_dl.h
generated
vendored
Normal file
15
gpushare-device-plugin/vendor/github.com/NVIDIA/nvidia-docker/src/nvml/nvml_dl.h
generated
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
// Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
#ifndef _NVML_DL_H_
#define _NVML_DL_H_

#include <nvml.h>

/* NVML_DL(x) names the dynamically loading wrapper for NVML function x. */
#define NVML_DL(x) x##_dl

/* Loads the NVML shared library and initialises NVML. */
nvmlReturn_t NVML_DL(nvmlInit)(void);

/* Shuts NVML down and closes the shared library. */
nvmlReturn_t NVML_DL(nvmlShutdown)(void);

/* Looks the NVML topology query up at run time and calls it. */
nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)(
  nvmlDevice_t, nvmlDevice_t, nvmlGpuTopologyLevel_t *);

#endif // _NVML_DL_H_
|
Reference in New Issue
Block a user