synchronization

commit 33f9b3ce46
2025-08-25 16:04:00 +08:00
1951 changed files with 854396 additions and 0 deletions


@@ -0,0 +1,29 @@
BSD 3-Clause License
Copyright (c) 2018, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


@@ -0,0 +1,634 @@
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
package nvml
// #cgo LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files
// #include "nvml_dl.h"
import "C"
import (
"errors"
"fmt"
"io/ioutil"
"os"
"sort"
"strconv"
"strings"
)
const (
szDriver = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE
szName = C.NVML_DEVICE_NAME_BUFFER_SIZE
szUUID = C.NVML_DEVICE_UUID_BUFFER_SIZE
szProcs = 32
szProcName = 64
XidCriticalError = C.nvmlEventTypeXidCriticalError
)
type handle struct{ dev C.nvmlDevice_t }
type EventSet struct{ set C.nvmlEventSet_t }
type Event struct {
UUID *string
Etype uint64
Edata uint64
}
func uintPtr(c C.uint) *uint {
i := uint(c)
return &i
}
func uint64Ptr(c C.ulonglong) *uint64 {
i := uint64(c)
return &i
}
func stringPtr(c *C.char) *string {
s := C.GoString(c)
return &s
}
func errorString(ret C.nvmlReturn_t) error {
if ret == C.NVML_SUCCESS {
return nil
}
err := C.GoString(C.nvmlErrorString(ret))
return fmt.Errorf("nvml: %v", err)
}
func init_() error {
r := C.nvmlInit_dl()
if r == C.NVML_ERROR_LIBRARY_NOT_FOUND {
return errors.New("could not load NVML library")
}
return errorString(r)
}
func NewEventSet() EventSet {
var set C.nvmlEventSet_t
C.nvmlEventSetCreate(&set)
return EventSet{set}
}
func RegisterEvent(es EventSet, event int) error {
n, err := deviceGetCount()
if err != nil {
return err
}
var i uint
for i = 0; i < n; i++ {
h, err := deviceGetHandleByIndex(i)
if err != nil {
return err
}
r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set)
if r != C.NVML_SUCCESS {
return errorString(r)
}
}
return nil
}
func RegisterEventForDevice(es EventSet, event int, uuid string) error {
n, err := deviceGetCount()
if err != nil {
return err
}
var i uint
for i = 0; i < n; i++ {
h, err := deviceGetHandleByIndex(i)
if err != nil {
return err
}
duuid, err := h.deviceGetUUID()
if err != nil {
return err
}
if *duuid != uuid {
continue
}
r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set)
if r != C.NVML_SUCCESS {
return errorString(r)
}
return nil
}
return fmt.Errorf("nvml: device not found")
}
func DeleteEventSet(es EventSet) {
C.nvmlEventSetFree(es.set)
}
func WaitForEvent(es EventSet, timeout uint) (Event, error) {
var data C.nvmlEventData_t
r := C.nvmlEventSetWait(es.set, &data, C.uint(timeout))
uuid, _ := handle{data.device}.deviceGetUUID()
return Event{
UUID: uuid,
Etype: uint64(data.eventType),
Edata: uint64(data.eventData),
},
errorString(r)
}
func shutdown() error {
return errorString(C.nvmlShutdown_dl())
}
func systemGetDriverVersion() (string, error) {
var driver [szDriver]C.char
r := C.nvmlSystemGetDriverVersion(&driver[0], szDriver)
return C.GoString(&driver[0]), errorString(r)
}
func systemGetProcessName(pid uint) (string, error) {
var proc [szProcName]C.char
r := C.nvmlSystemGetProcessName(C.uint(pid), &proc[0], szProcName)
return C.GoString(&proc[0]), errorString(r)
}
func deviceGetCount() (uint, error) {
var n C.uint
r := C.nvmlDeviceGetCount(&n)
return uint(n), errorString(r)
}
func deviceGetHandleByIndex(idx uint) (handle, error) {
var dev C.nvmlDevice_t
r := C.nvmlDeviceGetHandleByIndex(C.uint(idx), &dev)
return handle{dev}, errorString(r)
}
func deviceGetTopologyCommonAncestor(h1, h2 handle) (*uint, error) {
var level C.nvmlGpuTopologyLevel_t
r := C.nvmlDeviceGetTopologyCommonAncestor_dl(h1.dev, h2.dev, &level)
if r == C.NVML_ERROR_FUNCTION_NOT_FOUND || r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(C.uint(level)), errorString(r)
}
func (h handle) deviceGetName() (*string, error) {
var name [szName]C.char
r := C.nvmlDeviceGetName(h.dev, &name[0], szName)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return stringPtr(&name[0]), errorString(r)
}
func (h handle) deviceGetUUID() (*string, error) {
var uuid [szUUID]C.char
r := C.nvmlDeviceGetUUID(h.dev, &uuid[0], szUUID)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return stringPtr(&uuid[0]), errorString(r)
}
func (h handle) deviceGetPciInfo() (*string, error) {
var pci C.nvmlPciInfo_t
r := C.nvmlDeviceGetPciInfo(h.dev, &pci)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return stringPtr(&pci.busId[0]), errorString(r)
}
func (h handle) deviceGetMinorNumber() (*uint, error) {
var minor C.uint
r := C.nvmlDeviceGetMinorNumber(h.dev, &minor)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(minor), errorString(r)
}
func (h handle) deviceGetBAR1MemoryInfo() (*uint64, *uint64, error) {
var bar1 C.nvmlBAR1Memory_t
r := C.nvmlDeviceGetBAR1MemoryInfo(h.dev, &bar1)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
return uint64Ptr(bar1.bar1Total), uint64Ptr(bar1.bar1Used), errorString(r)
}
func (h handle) deviceGetPowerManagementLimit() (*uint, error) {
var power C.uint
r := C.nvmlDeviceGetPowerManagementLimit(h.dev, &power)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(power), errorString(r)
}
func (h handle) deviceGetMaxClockInfo() (*uint, *uint, error) {
var sm, mem C.uint
r := C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_SM, &sm)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
if r == C.NVML_SUCCESS {
r = C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem)
}
return uintPtr(sm), uintPtr(mem), errorString(r)
}
func (h handle) deviceGetMaxPcieLinkGeneration() (*uint, error) {
var link C.uint
r := C.nvmlDeviceGetMaxPcieLinkGeneration(h.dev, &link)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(link), errorString(r)
}
func (h handle) deviceGetMaxPcieLinkWidth() (*uint, error) {
var width C.uint
r := C.nvmlDeviceGetMaxPcieLinkWidth(h.dev, &width)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(width), errorString(r)
}
func (h handle) deviceGetPowerUsage() (*uint, error) {
var power C.uint
r := C.nvmlDeviceGetPowerUsage(h.dev, &power)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(power), errorString(r)
}
func (h handle) deviceGetTemperature() (*uint, error) {
var temp C.uint
r := C.nvmlDeviceGetTemperature(h.dev, C.NVML_TEMPERATURE_GPU, &temp)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(temp), errorString(r)
}
func (h handle) deviceGetUtilizationRates() (*uint, *uint, error) {
var usage C.nvmlUtilization_t
r := C.nvmlDeviceGetUtilizationRates(h.dev, &usage)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
return uintPtr(usage.gpu), uintPtr(usage.memory), errorString(r)
}
func (h handle) deviceGetEncoderUtilization() (*uint, error) {
var usage, sampling C.uint
r := C.nvmlDeviceGetEncoderUtilization(h.dev, &usage, &sampling)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(usage), errorString(r)
}
func (h handle) deviceGetDecoderUtilization() (*uint, error) {
var usage, sampling C.uint
r := C.nvmlDeviceGetDecoderUtilization(h.dev, &usage, &sampling)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(usage), errorString(r)
}
func (h handle) deviceGetMemoryInfo() (totalMem *uint64, devMem DeviceMemory, err error) {
var mem C.nvmlMemory_t
r := C.nvmlDeviceGetMemoryInfo(h.dev, &mem)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
err = errorString(r)
if r != C.NVML_SUCCESS {
return
}
totalMem = uint64Ptr(mem.total)
if totalMem != nil {
*totalMem /= 1024 * 1024 // MiB
}
devMem = DeviceMemory{
Used: uint64Ptr(mem.used),
Free: uint64Ptr(mem.free),
}
if devMem.Used != nil {
*devMem.Used /= 1024 * 1024 // MiB
}
if devMem.Free != nil {
*devMem.Free /= 1024 * 1024 // MiB
}
return
}
func (h handle) deviceGetClockInfo() (*uint, *uint, error) {
var sm, mem C.uint
r := C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_SM, &sm)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
if r == C.NVML_SUCCESS {
r = C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem)
}
return uintPtr(sm), uintPtr(mem), errorString(r)
}
func (h handle) deviceGetMemoryErrorCounter() (*uint64, *uint64, *uint64, error) {
var l1, l2, mem C.ulonglong
r := C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L1_CACHE, &l1)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil, nil
}
if r == C.NVML_SUCCESS {
r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L2_CACHE, &l2)
}
if r == C.NVML_SUCCESS {
r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_DEVICE_MEMORY, &mem)
}
return uint64Ptr(l1), uint64Ptr(l2), uint64Ptr(mem), errorString(r)
}
func (h handle) deviceGetPcieThroughput() (*uint, *uint, error) {
var rx, tx C.uint
r := C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_RX_BYTES, &rx)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
if r == C.NVML_SUCCESS {
r = C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_TX_BYTES, &tx)
}
return uintPtr(rx), uintPtr(tx), errorString(r)
}
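// The running-process queries below use a fixed buffer of szProcs (32)
// entries; if more processes than that are active on the device, NVML
// reports an insufficient-size error instead of truncating the list.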
func (h handle) deviceGetComputeRunningProcesses() ([]uint, []uint64, error) {
var procs [szProcs]C.nvmlProcessInfo_t
var count = C.uint(szProcs)
r := C.nvmlDeviceGetComputeRunningProcesses(h.dev, &count, &procs[0])
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
n := int(count)
pids := make([]uint, n)
mems := make([]uint64, n)
for i := 0; i < n; i++ {
pids[i] = uint(procs[i].pid)
mems[i] = uint64(procs[i].usedGpuMemory)
}
return pids, mems, errorString(r)
}
func (h handle) deviceGetGraphicsRunningProcesses() ([]uint, []uint64, error) {
var procs [szProcs]C.nvmlProcessInfo_t
var count = C.uint(szProcs)
r := C.nvmlDeviceGetGraphicsRunningProcesses(h.dev, &count, &procs[0])
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
n := int(count)
pids := make([]uint, n)
mems := make([]uint64, n)
for i := 0; i < n; i++ {
pids[i] = uint(procs[i].pid)
mems[i] = uint64(procs[i].usedGpuMemory)
}
return pids, mems, errorString(r)
}
func (h handle) deviceGetAllRunningProcesses() ([]ProcessInfo, error) {
cPids, cpMems, err := h.deviceGetComputeRunningProcesses()
if err != nil {
return nil, err
}
gPids, gpMems, err := h.deviceGetGraphicsRunningProcesses()
if err != nil {
return nil, err
}
allPids := make(map[uint]ProcessInfo)
for i, pid := range cPids {
name, err := processName(pid)
if err != nil {
return nil, err
}
allPids[pid] = ProcessInfo{
PID: pid,
Name: name,
MemoryUsed: cpMems[i] / (1024 * 1024), // MiB
Type: Compute,
}
}
for i, pid := range gPids {
pInfo, exists := allPids[pid]
if exists {
pInfo.Type = ComputeAndGraphics
allPids[pid] = pInfo
} else {
name, err := processName(pid)
if err != nil {
return nil, err
}
allPids[pid] = ProcessInfo{
PID: pid,
Name: name,
MemoryUsed: gpMems[i] / (1024 * 1024), // MiB
Type: Graphics,
}
}
}
var processInfo []ProcessInfo
for _, v := range allPids {
processInfo = append(processInfo, v)
}
sort.Slice(processInfo, func(i, j int) bool {
return processInfo[i].PID < processInfo[j].PID
})
return processInfo, nil
}
func (h handle) getClocksThrottleReasons() (reason ThrottleReason, err error) {
var clocksThrottleReasons C.ulonglong
r := C.nvmlDeviceGetCurrentClocksThrottleReasons(h.dev, &clocksThrottleReasons)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return ThrottleReasonUnknown, nil
}
if r != C.NVML_SUCCESS {
return ThrottleReasonUnknown, errorString(r)
}
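// nvmlDeviceGetCurrentClocksThrottleReasons returns a bitmask; the switch
// below only matches single-reason values, so a combination of reasons falls
// through and the zero value (ThrottleReasonGpuIdle) is returned.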
switch clocksThrottleReasons {
case C.nvmlClocksThrottleReasonGpuIdle:
reason = ThrottleReasonGpuIdle
case C.nvmlClocksThrottleReasonApplicationsClocksSetting:
reason = ThrottleReasonApplicationsClocksSetting
case C.nvmlClocksThrottleReasonSwPowerCap:
reason = ThrottleReasonSwPowerCap
case C.nvmlClocksThrottleReasonHwSlowdown:
reason = ThrottleReasonHwSlowdown
case C.nvmlClocksThrottleReasonSyncBoost:
reason = ThrottleReasonSyncBoost
case C.nvmlClocksThrottleReasonSwThermalSlowdown:
reason = ThrottleReasonSwThermalSlowdown
case C.nvmlClocksThrottleReasonHwThermalSlowdown:
reason = ThrottleReasonHwThermalSlowdown
case C.nvmlClocksThrottleReasonHwPowerBrakeSlowdown:
reason = ThrottleReasonHwPowerBrakeSlowdown
case C.nvmlClocksThrottleReasonDisplayClockSetting:
reason = ThrottleReasonDisplayClockSetting
case C.nvmlClocksThrottleReasonNone:
reason = ThrottleReasonNone
}
return
}
func (h handle) getPerformanceState() (PerfState, error) {
var pstate C.nvmlPstates_t
r := C.nvmlDeviceGetPerformanceState(h.dev, &pstate)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return PerfStateUnknown, nil
}
if r != C.NVML_SUCCESS {
return PerfStateUnknown, errorString(r)
}
return PerfState(pstate), nil
}
func processName(pid uint) (string, error) {
f := `/proc/` + strconv.FormatUint(uint64(pid), 10) + `/comm`
d, err := ioutil.ReadFile(f)
if err != nil {
// TOCTOU: process terminated
if os.IsNotExist(err) {
return "", nil
}
return "", err
}
return strings.TrimSuffix(string(d), "\n"), err
}
func (h handle) getAccountingInfo() (accountingInfo Accounting, err error) {
var mode C.nvmlEnableState_t
var buffer C.uint
r := C.nvmlDeviceGetAccountingMode(h.dev, &mode)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
if r != C.NVML_SUCCESS {
return accountingInfo, errorString(r)
}
r = C.nvmlDeviceGetAccountingBufferSize(h.dev, &buffer)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
if r != C.NVML_SUCCESS {
return accountingInfo, errorString(r)
}
accountingInfo = Accounting{
Mode: ModeState(mode),
BufferSize: uintPtr(buffer),
}
return
}
func (h handle) getDisplayInfo() (display Display, err error) {
var mode, isActive C.nvmlEnableState_t
r := C.nvmlDeviceGetDisplayActive(h.dev, &mode)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
if r != C.NVML_SUCCESS {
return display, errorString(r)
}
r = C.nvmlDeviceGetDisplayMode(h.dev, &isActive)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
if r != C.NVML_SUCCESS {
return display, errorString(r)
}
display = Display{
Mode: ModeState(mode),
Active: ModeState(isActive),
}
return
}
func (h handle) getPersistenceMode() (state ModeState, err error) {
var mode C.nvmlEnableState_t
r := C.nvmlDeviceGetPersistenceMode(h.dev, &mode)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
return ModeState(mode), errorString(r)
}
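A minimal sketch (not part of this commit) of how the exported event API above might be driven from another package, using Init and Shutdown from the companion file later in this diff. The import path is a placeholder, and the sketch assumes the NVML library and a GPU that supports Xid events are present.

package main

import (
	"fmt"
	"log"

	nvml "example.com/placeholder/nvml" // assumed import path
)

func main() {
	if err := nvml.Init(); err != nil {
		log.Fatal(err)
	}
	defer nvml.Shutdown()

	es := nvml.NewEventSet()
	defer nvml.DeleteEventSet(es)

	// Subscribe every device to Xid critical errors.
	if err := nvml.RegisterEvent(es, nvml.XidCriticalError); err != nil {
		log.Fatal(err)
	}

	// Block for up to 5000 ms waiting for one event; a timeout also
	// surfaces as an error here.
	e, err := nvml.WaitForEvent(es, 5000)
	if err != nil {
		log.Println(err)
		return
	}
	if e.UUID != nil {
		fmt.Printf("Xid %d on GPU %s\n", e.Edata, *e.UUID)
	}
}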


@@ -0,0 +1,533 @@
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
package nvml
// #include "nvml_dl.h"
import "C"
import (
"bytes"
"errors"
"fmt"
"io/ioutil"
"strconv"
"strings"
)
var (
ErrCPUAffinity = errors.New("failed to retrieve CPU affinity")
ErrUnsupportedP2PLink = errors.New("unsupported P2P link type")
ErrUnsupportedGPU = errors.New("unsupported GPU device")
)
type ModeState uint
const (
Enabled ModeState = iota
Disabled
)
func (m ModeState) String() string {
switch m {
case Enabled:
return "Enabled"
case Disabled:
return "Disabled"
}
return "N/A"
}
type Display struct {
Mode ModeState
Active ModeState
}
type Accounting struct {
Mode ModeState
BufferSize *uint
}
type DeviceMode struct {
DisplayInfo Display
Persistence ModeState
AccountingInfo Accounting
}
type ThrottleReason uint
const (
ThrottleReasonGpuIdle ThrottleReason = iota
ThrottleReasonApplicationsClocksSetting
ThrottleReasonSwPowerCap
ThrottleReasonHwSlowdown
ThrottleReasonSyncBoost
ThrottleReasonSwThermalSlowdown
ThrottleReasonHwThermalSlowdown
ThrottleReasonHwPowerBrakeSlowdown
ThrottleReasonDisplayClockSetting
ThrottleReasonNone
ThrottleReasonUnknown
)
func (r ThrottleReason) String() string {
switch r {
case ThrottleReasonGpuIdle:
return "Gpu Idle"
case ThrottleReasonApplicationsClocksSetting:
return "Applications Clocks Setting"
case ThrottleReasonSwPowerCap:
return "SW Power Cap"
case ThrottleReasonHwSlowdown:
return "HW Slowdown"
case ThrottleReasonSyncBoost:
return "Sync Boost"
case ThrottleReasonSwThermalSlowdown:
return "SW Thermal Slowdown"
case ThrottleReasonHwThermalSlowdown:
return "HW Thermal Slowdown"
case ThrottleReasonHwPowerBrakeSlowdown:
return "HW Power Brake Slowdown"
case ThrottleReasonDisplayClockSetting:
return "Display Clock Setting"
case ThrottleReasonNone:
return "No clocks throttling"
}
return "N/A"
}
type PerfState uint
const (
PerfStateMax = 0
PerfStateMin = 15
PerfStateUnknown = 32
)
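// P0 is the highest performance state and P15 the lowest; 32 corresponds to
// NVML's "unknown" performance state.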
func (p PerfState) String() string {
if p >= PerfStateMax && p <= PerfStateMin {
return fmt.Sprintf("P%d", p)
}
return "Unknown"
}
type ProcessType uint
const (
Compute ProcessType = iota
Graphics
ComputeAndGraphics
)
func (t ProcessType) String() string {
typ := "C+G"
if t == Compute {
typ = "C"
} else if t == Graphics {
typ = "G"
}
return typ
}
type P2PLinkType uint
const (
P2PLinkUnknown P2PLinkType = iota
P2PLinkCrossCPU
P2PLinkSameCPU
P2PLinkHostBridge
P2PLinkMultiSwitch
P2PLinkSingleSwitch
P2PLinkSameBoard
)
type P2PLink struct {
BusID string
Link P2PLinkType
}
func (t P2PLinkType) String() string {
switch t {
case P2PLinkCrossCPU:
return "Cross CPU socket"
case P2PLinkSameCPU:
return "Same CPU socket"
case P2PLinkHostBridge:
return "Host PCI bridge"
case P2PLinkMultiSwitch:
return "Multiple PCI switches"
case P2PLinkSingleSwitch:
return "Single PCI switch"
case P2PLinkSameBoard:
return "Same board"
case P2PLinkUnknown:
}
return "N/A"
}
type ClockInfo struct {
Cores *uint
Memory *uint
}
type PCIInfo struct {
BusID string
BAR1 *uint64
Bandwidth *uint
}
type Device struct {
handle
UUID string
Path string
Model *string
Power *uint
Memory *uint64
CPUAffinity *uint
PCI PCIInfo
Clocks ClockInfo
Topology []P2PLink
}
type UtilizationInfo struct {
GPU *uint
Memory *uint
Encoder *uint
Decoder *uint
}
type PCIThroughputInfo struct {
RX *uint
TX *uint
}
type PCIStatusInfo struct {
BAR1Used *uint64
Throughput PCIThroughputInfo
}
type ECCErrorsInfo struct {
L1Cache *uint64
L2Cache *uint64
Device *uint64
}
type DeviceMemory struct {
Used *uint64
Free *uint64
}
type MemoryInfo struct {
Global DeviceMemory
ECCErrors ECCErrorsInfo
}
type ProcessInfo struct {
PID uint
Name string
MemoryUsed uint64
Type ProcessType
}
type DeviceStatus struct {
Power *uint
Temperature *uint
Utilization UtilizationInfo
Memory MemoryInfo
Clocks ClockInfo
PCI PCIStatusInfo
Processes []ProcessInfo
Throttle ThrottleReason
Performance PerfState
}
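// assert panics on a non-nil error; NewDevice, NewDeviceLite, Status and
// GetDeviceMode recover the panic in a deferred function and return it as err.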
func assert(err error) {
if err != nil {
panic(err)
}
}
func Init() error {
return init_()
}
func Shutdown() error {
return shutdown()
}
func GetDeviceCount() (uint, error) {
return deviceGetCount()
}
func GetDriverVersion() (string, error) {
return systemGetDriverVersion()
}
func numaNode(busid string) (uint, error) {
// discard leading zeros of busid
b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", strings.ToLower(busid[4:])))
if err != nil {
// XXX report node 0 if NUMA support isn't enabled
return 0, nil
}
node, err := strconv.ParseInt(string(bytes.TrimSpace(b)), 10, 8)
if err != nil {
return 0, fmt.Errorf("%v: %v", ErrCPUAffinity, err)
}
if node < 0 {
node = 0 // XXX report node 0 instead of NUMA_NO_NODE
}
return uint(node), nil
}
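// pciBandwidth estimates the peak PCIe bandwidth in MB/s from the per-lane
// rate of the link generation and the link width, e.g. a gen3 x16 link gives
// 985 * 16 = 15760 MB/s.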
func pciBandwidth(gen, width *uint) *uint {
m := map[uint]uint{
1: 250, // MB/s
2: 500,
3: 985,
4: 1969,
}
if gen == nil || width == nil {
return nil
}
bw := m[*gen] * *width
return &bw
}
func NewDevice(idx uint) (device *Device, err error) {
defer func() {
if r := recover(); r != nil {
err = r.(error)
}
}()
h, err := deviceGetHandleByIndex(idx)
assert(err)
model, err := h.deviceGetName()
assert(err)
uuid, err := h.deviceGetUUID()
assert(err)
minor, err := h.deviceGetMinorNumber()
assert(err)
power, err := h.deviceGetPowerManagementLimit()
assert(err)
totalMem, _, err := h.deviceGetMemoryInfo()
assert(err)
busid, err := h.deviceGetPciInfo()
assert(err)
bar1, _, err := h.deviceGetBAR1MemoryInfo()
assert(err)
pcig, err := h.deviceGetMaxPcieLinkGeneration()
assert(err)
pciw, err := h.deviceGetMaxPcieLinkWidth()
assert(err)
ccore, cmem, err := h.deviceGetMaxClockInfo()
assert(err)
if minor == nil || busid == nil || uuid == nil {
return nil, ErrUnsupportedGPU
}
path := fmt.Sprintf("/dev/nvidia%d", *minor)
node, err := numaNode(*busid)
assert(err)
device = &Device{
handle: h,
UUID: *uuid,
Path: path,
Model: model,
Power: power,
Memory: totalMem,
CPUAffinity: &node,
PCI: PCIInfo{
BusID: *busid,
BAR1: bar1,
Bandwidth: pciBandwidth(pcig, pciw), // MB/s
},
Clocks: ClockInfo{
Cores: ccore, // MHz
Memory: cmem, // MHz
},
}
if power != nil {
*device.Power /= 1000 // W
}
if bar1 != nil {
*device.PCI.BAR1 /= 1024 * 1024 // MiB
}
return
}
func NewDeviceLite(idx uint) (device *Device, err error) {
defer func() {
if r := recover(); r != nil {
err = r.(error)
}
}()
h, err := deviceGetHandleByIndex(idx)
assert(err)
uuid, err := h.deviceGetUUID()
assert(err)
minor, err := h.deviceGetMinorNumber()
assert(err)
busid, err := h.deviceGetPciInfo()
assert(err)
if minor == nil || busid == nil || uuid == nil {
return nil, ErrUnsupportedGPU
}
path := fmt.Sprintf("/dev/nvidia%d", *minor)
device = &Device{
handle: h,
UUID: *uuid,
Path: path,
PCI: PCIInfo{
BusID: *busid,
},
}
return
}
func (d *Device) Status() (status *DeviceStatus, err error) {
defer func() {
if r := recover(); r != nil {
err = r.(error)
}
}()
power, err := d.deviceGetPowerUsage()
assert(err)
temp, err := d.deviceGetTemperature()
assert(err)
ugpu, umem, err := d.deviceGetUtilizationRates()
assert(err)
uenc, err := d.deviceGetEncoderUtilization()
assert(err)
udec, err := d.deviceGetDecoderUtilization()
assert(err)
_, devMem, err := d.deviceGetMemoryInfo()
assert(err)
ccore, cmem, err := d.deviceGetClockInfo()
assert(err)
_, bar1, err := d.deviceGetBAR1MemoryInfo()
assert(err)
el1, el2, emem, err := d.deviceGetMemoryErrorCounter()
assert(err)
pcirx, pcitx, err := d.deviceGetPcieThroughput()
assert(err)
throttle, err := d.getClocksThrottleReasons()
assert(err)
perfState, err := d.getPerformanceState()
assert(err)
processInfo, err := d.deviceGetAllRunningProcesses()
assert(err)
status = &DeviceStatus{
Power: power,
Temperature: temp, // °C
Utilization: UtilizationInfo{
GPU: ugpu, // %
Memory: umem, // %
Encoder: uenc, // %
Decoder: udec, // %
},
Memory: MemoryInfo{
Global: devMem,
ECCErrors: ECCErrorsInfo{
L1Cache: el1,
L2Cache: el2,
Device: emem,
},
},
Clocks: ClockInfo{
Cores: ccore, // MHz
Memory: cmem, // MHz
},
PCI: PCIStatusInfo{
BAR1Used: bar1,
Throughput: PCIThroughputInfo{
RX: pcirx,
TX: pcitx,
},
},
Throttle: throttle,
Performance: perfState,
Processes: processInfo,
}
if power != nil {
*status.Power /= 1000 // W
}
if bar1 != nil {
*status.PCI.BAR1Used /= 1024 * 1024 // MiB
}
if pcirx != nil {
*status.PCI.Throughput.RX /= 1000 // MB/s
}
if pcitx != nil {
*status.PCI.Throughput.TX /= 1000 // MB/s
}
return
}
func GetP2PLink(dev1, dev2 *Device) (link P2PLinkType, err error) {
level, err := deviceGetTopologyCommonAncestor(dev1.handle, dev2.handle)
if err != nil || level == nil {
return P2PLinkUnknown, err
}
switch *level {
case C.NVML_TOPOLOGY_INTERNAL:
link = P2PLinkSameBoard
case C.NVML_TOPOLOGY_SINGLE:
link = P2PLinkSingleSwitch
case C.NVML_TOPOLOGY_MULTIPLE:
link = P2PLinkMultiSwitch
case C.NVML_TOPOLOGY_HOSTBRIDGE:
link = P2PLinkHostBridge
case C.NVML_TOPOLOGY_CPU:
link = P2PLinkSameCPU
case C.NVML_TOPOLOGY_SYSTEM:
link = P2PLinkCrossCPU
default:
err = ErrUnsupportedP2PLink
}
return
}
func (d *Device) GetComputeRunningProcesses() ([]uint, []uint64, error) {
return d.handle.deviceGetComputeRunningProcesses()
}
func (d *Device) GetGraphicsRunningProcesses() ([]uint, []uint64, error) {
return d.handle.deviceGetGraphicsRunningProcesses()
}
func (d *Device) GetAllRunningProcesses() ([]ProcessInfo, error) {
return d.handle.deviceGetAllRunningProcesses()
}
func (d *Device) GetDeviceMode() (mode *DeviceMode, err error) {
defer func() {
if r := recover(); r != nil {
err = r.(error)
}
}()
display, err := d.getDisplayInfo()
assert(err)
p, err := d.getPersistenceMode()
assert(err)
accounting, err := d.getAccountingInfo()
assert(err)
mode = &DeviceMode{
DisplayInfo: display,
Persistence: p,
AccountingInfo: accounting,
}
return
}
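
A sketch of the higher-level flow this file exposes (Init, GetDeviceCount, NewDevice, Status, Shutdown); again the import path is a placeholder, and the optional fields are pointers that stay nil when a query is unsupported on the GPU.

package main

import (
	"fmt"
	"log"

	nvml "example.com/placeholder/nvml" // assumed import path
)

func main() {
	if err := nvml.Init(); err != nil {
		log.Fatal(err)
	}
	defer nvml.Shutdown()

	n, err := nvml.GetDeviceCount()
	if err != nil {
		log.Fatal(err)
	}
	for i := uint(0); i < n; i++ {
		d, err := nvml.NewDevice(i)
		if err != nil {
			log.Fatal(err)
		}
		st, err := d.Status()
		if err != nil {
			log.Fatal(err)
		}
		model := "N/A"
		if d.Model != nil {
			model = *d.Model
		}
		fmt.Printf("GPU %d: %s (%s)\n", i, d.UUID, model)
		if st.Temperature != nil && st.Power != nil {
			// Power is already converted to watts by Status.
			fmt.Printf("  %d C, %d W, pstate %v\n", *st.Temperature, *st.Power, st.Performance)
		}
	}
}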

File diff suppressed because it is too large


@@ -0,0 +1,46 @@
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#include <stddef.h>
#include <dlfcn.h>
#include "nvml_dl.h"
#define DLSYM(x, sym) \
do { \
dlerror(); \
x = dlsym(handle, #sym); \
if (dlerror() != NULL) { \
return (NVML_ERROR_FUNCTION_NOT_FOUND); \
} \
} while (0)
typedef nvmlReturn_t (*nvmlSym_t)();
static void *handle;
nvmlReturn_t NVML_DL(nvmlInit)(void)
{
handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY | RTLD_GLOBAL);
if (handle == NULL) {
return (NVML_ERROR_LIBRARY_NOT_FOUND);
}
return (nvmlInit());
}
nvmlReturn_t NVML_DL(nvmlShutdown)(void)
{
nvmlReturn_t r = nvmlShutdown();
if (r != NVML_SUCCESS) {
return (r);
}
return (dlclose(handle) ? NVML_ERROR_UNKNOWN : NVML_SUCCESS);
}
nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)(
nvmlDevice_t dev1, nvmlDevice_t dev2, nvmlGpuTopologyLevel_t *info)
{
nvmlSym_t sym;
DLSYM(sym, nvmlDeviceGetTopologyCommonAncestor);
return ((*sym)(dev1, dev2, info));
}


@@ -0,0 +1,15 @@
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#ifndef _NVML_DL_H_
#define _NVML_DL_H_
#include "nvml.h"
#define NVML_DL(x) x##_dl
extern nvmlReturn_t NVML_DL(nvmlInit)(void);
extern nvmlReturn_t NVML_DL(nvmlShutdown)(void);
extern nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)(
nvmlDevice_t, nvmlDevice_t, nvmlGpuTopologyLevel_t *);
#endif // _NVML_DL_H_