synchronization

gpushare-device-plugin/cmd/inspect/display.go (new file, 255 lines)
@@ -0,0 +1,255 @@
package main

import (
	"bytes"
	"fmt"
	"os"
	"strconv"
	"text/tabwriter"

	log "github.com/golang/glog"
	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
)

// displayDetails prints, for each node, the pods running on it and the GPU
// memory they have allocated on each device.
func displayDetails(nodeInfos []*NodeInfo) {
	w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
	var (
		totalGPUMemInCluster int64
		usedGPUMemInCluster  int64
		prtLineLen           int
	)

	for _, nodeInfo := range nodeInfos {
		address := "unknown"
		if len(nodeInfo.node.Status.Addresses) > 0 {
			for _, addr := range nodeInfo.node.Status.Addresses {
				if addr.Type == v1.NodeInternalIP {
					address = addr.Address
					break
				}
			}
		}

		totalGPUMemInNode := nodeInfo.gpuTotalMemory
		if totalGPUMemInNode <= 0 {
			continue
		}

		fmt.Fprintf(w, "\n")
		fmt.Fprintf(w, "NAME:\t%s\n", nodeInfo.node.Name)
		fmt.Fprintf(w, "IPADDRESS:\t%s\n", address)
		fmt.Fprintf(w, "\n")

		usedGPUMemInNode := 0
		var buf bytes.Buffer
		buf.WriteString("NAME\tNAMESPACE\t")
		for i := 0; i < nodeInfo.gpuCount; i++ {
			buf.WriteString(fmt.Sprintf("GPU%d(Allocated)\t", i))
		}

		if nodeInfo.hasPendingGPUMemory() {
			buf.WriteString("Pending(Allocated)\t")
		}
		buf.WriteString("\n")
		// Use Fprint, not Fprintf: the buffer is data, not a format string.
		fmt.Fprint(w, buf.String())

		var buffer bytes.Buffer
		exists := map[types.UID]bool{}
		for i, dev := range nodeInfo.devs {
			usedGPUMemInNode += dev.usedGPUMem
			for _, pod := range dev.pods {
				// A pod may appear on several devices; print it only once.
				if _, ok := exists[pod.UID]; ok {
					continue
				}
				buffer.WriteString(fmt.Sprintf("%s\t%s\t", pod.Name, pod.Namespace))
				count := nodeInfo.gpuCount
				if nodeInfo.hasPendingGPUMemory() {
					count += 1
				}

				for k := 0; k < count; k++ {
					allocation := GetAllocation(&pod)
					if len(allocation) != 0 {
						buffer.WriteString(fmt.Sprintf("%d\t", allocation[k]))
						continue
					}
					// The pending device (i == -1) maps to the extra column
					// at index gpuCount.
					if k == i || (i == -1 && k == nodeInfo.gpuCount) {
						buffer.WriteString(fmt.Sprintf("%d\t", getGPUMemoryInPod(pod)))
					} else {
						buffer.WriteString("0\t")
					}
				}
				buffer.WriteString("\n")
				exists[pod.UID] = true
			}
		}
		if prtLineLen == 0 {
			prtLineLen = buffer.Len() + 10
		}
		fmt.Fprint(w, buffer.String())

		var gpuUsageInNode float64 = 0
		if totalGPUMemInNode > 0 {
			gpuUsageInNode = float64(usedGPUMemInNode) / float64(totalGPUMemInNode) * 100
		} else {
			fmt.Fprintf(w, "\n")
		}

		fmt.Fprintf(w, "Allocated :\t%d (%d%%)\t\n", usedGPUMemInNode, int64(gpuUsageInNode))
		fmt.Fprintf(w, "Total :\t%d \t\n", nodeInfo.gpuTotalMemory)
		var prtLine bytes.Buffer
		for i := 0; i < prtLineLen; i++ {
			prtLine.WriteString("-")
		}
		prtLine.WriteString("\n")
		fmt.Fprint(w, prtLine.String())
		totalGPUMemInCluster += int64(totalGPUMemInNode)
		usedGPUMemInCluster += int64(usedGPUMemInNode)
	}
	fmt.Fprintf(w, "\n")
	fmt.Fprintf(w, "\n")
	fmt.Fprintf(w, "Allocated/Total GPU Memory In Cluster:\t")
	log.V(2).Infof("gpu: %s, allocated GPU Memory %s", strconv.FormatInt(totalGPUMemInCluster, 10),
		strconv.FormatInt(usedGPUMemInCluster, 10))

	var gpuUsage float64 = 0
	if totalGPUMemInCluster > 0 {
		gpuUsage = float64(usedGPUMemInCluster) / float64(totalGPUMemInCluster) * 100
	}
	fmt.Fprintf(w, "%s/%s (%d%%)\t\n",
		strconv.FormatInt(usedGPUMemInCluster, 10),
		strconv.FormatInt(totalGPUMemInCluster, 10),
		int64(gpuUsage))

	_ = w.Flush()
}

// getMaxGPUCount returns the largest GPU count across all nodes.
func getMaxGPUCount(nodeInfos []*NodeInfo) (max int) {
	for _, node := range nodeInfos {
		if node.gpuCount > max {
			max = node.gpuCount
		}
	}

	return max
}

// displaySummary prints one row per node with allocated/total GPU memory
// for each device.
func displaySummary(nodeInfos []*NodeInfo) {
	w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
	var (
		maxGPUCount          int
		totalGPUMemInCluster int64
		usedGPUMemInCluster  int64
		prtLineLen           int
	)

	hasPendingGPU := hasPendingGPUMemory(nodeInfos)

	maxGPUCount = getMaxGPUCount(nodeInfos)

	var buffer bytes.Buffer
	buffer.WriteString("NAME\tIPADDRESS\t")
	for i := 0; i < maxGPUCount; i++ {
		buffer.WriteString(fmt.Sprintf("GPU%d(Allocated/Total)\t", i))
	}

	if hasPendingGPU {
		buffer.WriteString("PENDING(Allocated)\t")
	}
	buffer.WriteString(fmt.Sprintf("GPU Memory(%s)\n", memoryUnit))

	fmt.Fprint(w, buffer.String())
	for _, nodeInfo := range nodeInfos {
		address := "unknown"
		if len(nodeInfo.node.Status.Addresses) > 0 {
			for _, addr := range nodeInfo.node.Status.Addresses {
				if addr.Type == v1.NodeInternalIP {
					address = addr.Address
					break
				}
			}
		}

		gpuMemInfos := []string{}
		pendingGPUMemInfo := ""
		usedGPUMemInNode := 0
		totalGPUMemInNode := nodeInfo.gpuTotalMemory
		if totalGPUMemInNode <= 0 {
			continue
		}

		for i := 0; i < maxGPUCount; i++ {
			gpuMemInfo := "0/0"
			if dev, ok := nodeInfo.devs[i]; ok {
				gpuMemInfo = dev.String()
				usedGPUMemInNode += dev.usedGPUMem
			}
			gpuMemInfos = append(gpuMemInfos, gpuMemInfo)
		}

		// Check if there is a pending device (idx -1).
		if dev, ok := nodeInfo.devs[-1]; ok {
			pendingGPUMemInfo = fmt.Sprintf("%d", dev.usedGPUMem)
			usedGPUMemInNode += dev.usedGPUMem
		}

		nodeGPUMemInfo := fmt.Sprintf("%d/%d", usedGPUMemInNode, totalGPUMemInNode)

		var buf bytes.Buffer
		buf.WriteString(fmt.Sprintf("%s\t%s\t", nodeInfo.node.Name, address))
		for i := 0; i < maxGPUCount; i++ {
			buf.WriteString(fmt.Sprintf("%s\t", gpuMemInfos[i]))
		}
		if hasPendingGPU {
			buf.WriteString(fmt.Sprintf("%s\t", pendingGPUMemInfo))
		}

		buf.WriteString(fmt.Sprintf("%s\n", nodeGPUMemInfo))
		fmt.Fprint(w, buf.String())

		if prtLineLen == 0 {
			prtLineLen = buf.Len() + 20
		}

		usedGPUMemInCluster += int64(usedGPUMemInNode)
		totalGPUMemInCluster += int64(totalGPUMemInNode)
	}
	var prtLine bytes.Buffer
	for i := 0; i < prtLineLen; i++ {
		prtLine.WriteString("-")
	}
	prtLine.WriteString("\n")
	fmt.Fprint(w, prtLine.String())

	fmt.Fprintf(w, "Allocated/Total GPU Memory In Cluster:\n")
	log.V(2).Infof("gpu: %s, allocated GPU Memory %s", strconv.FormatInt(totalGPUMemInCluster, 10),
		strconv.FormatInt(usedGPUMemInCluster, 10))
	var gpuUsage float64 = 0
	if totalGPUMemInCluster > 0 {
		gpuUsage = float64(usedGPUMemInCluster) / float64(totalGPUMemInCluster) * 100
	}
	fmt.Fprintf(w, "%s/%s (%d%%)\t\n",
		strconv.FormatInt(usedGPUMemInCluster, 10),
		strconv.FormatInt(totalGPUMemInCluster, 10),
		int64(gpuUsage))

	_ = w.Flush()
}

// getGPUMemoryInPod sums the GPU memory limits across all containers in the pod.
func getGPUMemoryInPod(pod v1.Pod) int {
	gpuMem := 0
	for _, container := range pod.Spec.Containers {
		if val, ok := container.Resources.Limits[resourceName]; ok {
			gpuMem += int(val.Value())
		}
	}
	return gpuMem
}
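Aside: the aligned tables above come from text/tabwriter, which buffers tab-separated rows and pads them into columns when Flush is called. A minimal sketch of the same pattern, with made-up node data:

package main

import (
	"fmt"
	"os"
	"text/tabwriter"
)

func main() {
	// Same parameters as the inspect tool: minwidth 0, tabwidth 0,
	// padding 2, pad with spaces, no flags.
	w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
	fmt.Fprintln(w, "NAME\tIPADDRESS\tGPU0(Allocated/Total)")
	fmt.Fprintln(w, "node-a\t10.0.0.1\t2048/8192")
	fmt.Fprintln(w, "node-b\t10.0.0.2\t0/8192")
	// Nothing reaches stdout until Flush, which computes column widths
	// from all buffered rows.
	_ = w.Flush()
}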

gpushare-device-plugin/cmd/inspect/main.go (new file, 74 lines)
@@ -0,0 +1,74 @@
package main

import (
	"flag"
	"fmt"
	"os"

	v1 "k8s.io/api/core/v1"
)

const (
	resourceName         = "rainbond.com/gpu-mem"
	countName            = "rainbond.com/gpu-count"
	gpuCountKey          = "aliyun.accelerator/nvidia_count"
	cardNameKey          = "aliyun.accelerator/nvidia_name"
	gpuMemKey            = "aliyun.accelerator/nvidia_mem"
	pluginComponentKey   = "component"
	pluginComponentValue = "gpushare-device-plugin"

	envNVGPUID             = "ALIYUN_COM_GPU_MEM_IDX"
	envPodGPUMemory        = "ALIYUN_COM_GPU_MEM_POD"
	envTOTALGPUMEMORY      = "ALIYUN_COM_GPU_MEM_DEV"
	gpushareAllocationFlag = "scheduler.framework.gpushare.allocation"
)

func init() {
	kubeInit()
}

func main() {
	var nodeName string
	details := flag.Bool("d", false, "details")
	flag.Parse()

	// An optional positional argument restricts the report to one node.
	args := flag.Args()
	if len(args) > 0 {
		nodeName = args[0]
	}

	var pods []v1.Pod
	var nodes []v1.Node
	var err error

	if nodeName == "" {
		nodes, err = getAllSharedGPUNode()
		if err == nil {
			pods, err = getActivePodsInAllNodes()
		}
	} else {
		nodes, err = getNodes(nodeName)
		if err == nil {
			pods, err = getActivePodsByNode(nodeName)
		}
	}

	if err != nil {
		fmt.Printf("Failed due to %v", err)
		os.Exit(1)
	}

	nodeInfos, err := buildAllNodeInfos(pods, nodes)
	if err != nil {
		fmt.Printf("Failed due to %v", err)
		os.Exit(1)
	}
	if *details {
		displayDetails(nodeInfos)
	} else {
		displaySummary(nodeInfos)
	}
}
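One CLI detail worth noting: Go's standard flag package stops parsing at the first non-flag argument, so -d must come before the optional node name. A small sketch of that behavior (the quoted command lines are hypothetical):

package main

import (
	"flag"
	"fmt"
)

func main() {
	// Mirrors the inspect tool's CLI: a -d boolean plus an optional
	// positional node name.
	details := flag.Bool("d", false, "details")
	flag.Parse()

	nodeName := ""
	if args := flag.Args(); len(args) > 0 {
		nodeName = args[0]
	}
	// "inspect -d node-1" -> details=true,  nodeName="node-1"
	// "inspect node-1 -d" -> details=false, nodeName="node-1"
	// (in the second form "-d" is just a second positional argument)
	fmt.Println(*details, nodeName)
}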

gpushare-device-plugin/cmd/inspect/nodeinfo.go (new file, 271 lines)
@@ -0,0 +1,271 @@
package main

import (
	"encoding/json"
	"fmt"
	"strconv"

	log "github.com/golang/glog"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

type DeviceInfo struct {
	idx         int
	pods        []v1.Pod
	usedGPUMem  int
	totalGPUMem int
	node        v1.Node
}

// String renders the device as "used/total". A pending device (idx == -1)
// has no fixed total, so only the used amount is shown.
func (d *DeviceInfo) String() string {
	if d.idx == -1 {
		return fmt.Sprintf("%d", d.usedGPUMem)
	}
	return fmt.Sprintf("%d/%d", d.usedGPUMem, d.totalGPUMem)
}

func (d *DeviceInfo) addGPUPod(pod v1.Pod) {
	if len(d.pods) == 0 {
		d.pods = []v1.Pod{}
	}
	d.pods = append(d.pods, pod)
}

type NodeInfo struct {
	pods           []v1.Pod
	node           v1.Node
	devs           map[int]*DeviceInfo
	gpuCount       int
	gpuTotalMemory int
	pluginPod      v1.Pod
}

// buildAllNodeInfos is the key function: it assembles the per-node GPU
// usage view from the pod and node lists.
func buildAllNodeInfos(allPods []v1.Pod, nodes []v1.Node) ([]*NodeInfo, error) {
	nodeInfos := buildNodeInfoWithPods(allPods, nodes)
	for _, info := range nodeInfos {
		if info.gpuTotalMemory > 0 {
			setUnit(info.gpuTotalMemory, info.gpuCount)
			err := info.buildDeviceInfo()
			if err != nil {
				log.Warningf("Failed due to %v", err)
				continue
			}
		}
	}
	return nodeInfos, nil
}

func (n *NodeInfo) acquirePluginPod() v1.Pod {
	if n.pluginPod.Name == "" {
		for _, pod := range n.pods {
			if val, ok := pod.Labels[pluginComponentKey]; ok {
				if val == pluginComponentValue {
					n.pluginPod = pod
					break
				}
			}
		}
	}
	return n.pluginPod
}

func getTotalGPUMemory(node v1.Node) int {
	val, ok := node.Status.Allocatable[resourceName]

	if !ok {
		return 0
	}

	return int(val.Value())
}

func getGPUCountInNode(node v1.Node) int {
	val, ok := node.Status.Allocatable[countName]

	if !ok {
		return 0
	}

	return int(val.Value())
}

func buildNodeInfoWithPods(pods []v1.Pod, nodes []v1.Node) []*NodeInfo {
	nodeMap := map[string]*NodeInfo{}
	nodeList := []*NodeInfo{}

	for _, node := range nodes {
		info := &NodeInfo{}
		if value, ok := nodeMap[node.Name]; ok {
			info = value
		} else {
			nodeMap[node.Name] = info
			info.node = node
			info.pods = []v1.Pod{}
			info.gpuCount = getGPUCountInNode(node)
			info.gpuTotalMemory = getTotalGPUMemory(node)
			info.devs = map[int]*DeviceInfo{}

			// Assume the node's GPU memory is evenly split across devices.
			for i := 0; i < info.gpuCount; i++ {
				dev := &DeviceInfo{
					pods:        []v1.Pod{},
					idx:         i,
					totalGPUMem: info.gpuTotalMemory / info.gpuCount,
					node:        info.node,
				}
				info.devs[i] = dev
			}
		}

		for _, pod := range pods {
			if pod.Spec.NodeName == node.Name {
				info.pods = append(info.pods, pod)
			}
		}
	}

	for _, v := range nodeMap {
		nodeList = append(nodeList, v)
	}
	return nodeList
}

// hasPendingGPUMemory reports whether some GPU memory on the node is held
// by pods not yet bound to a concrete device (tracked under index -1).
func (n *NodeInfo) hasPendingGPUMemory() bool {
	_, found := n.devs[-1]
	return found
}

// buildDeviceInfo attributes each pod's used GPU memory to the device(s)
// it is allocated on.
func (n *NodeInfo) buildDeviceInfo() error {
	totalGPUMem := 0
	if n.gpuCount > 0 {
		totalGPUMem = n.gpuTotalMemory / n.gpuCount
	}
GPUSearchLoop:
	for _, pod := range n.pods {
		if gpuMemoryInPod(pod) <= 0 {
			continue GPUSearchLoop
		}
		for devID, usedGPUMem := range n.getDeviceInfo(pod) {
			if n.devs[devID] == nil {
				n.devs[devID] = &DeviceInfo{
					pods:        []v1.Pod{},
					idx:         devID,
					totalGPUMem: totalGPUMem,
					node:        n.node,
				}
			}
			n.devs[devID].usedGPUMem += usedGPUMem
			n.devs[devID].pods = append(n.devs[devID].pods, pod)
		}
	}
	return nil
}

// getDeviceInfo maps device index -> GPU memory used by this pod. It prefers
// the per-container allocation annotation and falls back to the single
// device-index annotation; index -1 means the pod is still pending.
func (n *NodeInfo) getDeviceInfo(pod v1.Pod) map[int]int {
	id := -1
	allocation := GetAllocation(&pod)
	if len(allocation) != 0 {
		return allocation
	}
	if len(pod.ObjectMeta.Annotations) > 0 {
		value, found := pod.ObjectMeta.Annotations[envNVGPUID]
		if found {
			var err error
			id, err = strconv.Atoi(value)
			if err != nil {
				log.Warningf("Failed to parse dev id %s due to %v for pod %s in ns %s",
					value,
					err,
					pod.Name,
					pod.Namespace)
				id = -1
			}
		} else {
			log.Warningf("Failed to get dev id for pod %s in ns %s",
				pod.Name,
				pod.Namespace)
		}
	}
	return map[int]int{id: gpuMemoryInPod(pod)}
}

func hasPendingGPUMemory(nodeInfos []*NodeInfo) (found bool) {
	for _, info := range nodeInfos {
		if info.hasPendingGPUMemory() {
			return true
		}
	}

	return false
}

func getNodes(nodeName string) ([]v1.Node, error) {
	node, err := clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
	if err != nil {
		return []v1.Node{}, err
	}
	return []v1.Node{*node}, nil
}

func isGPUSharingNode(node v1.Node) bool {
	value, ok := node.Status.Allocatable[resourceName]

	if ok {
		ok = int(value.Value()) > 0
	}

	return ok
}

var (
	memoryUnit = ""
)

// setUnit guesses the display unit from the per-device memory size: values
// above 100 are assumed to be MiB, smaller ones GiB. The unit is set once.
func setUnit(gpuMemory, gpuCount int) {
	if memoryUnit != "" {
		return
	}

	if gpuCount == 0 {
		return
	}

	gpuMemoryByDev := gpuMemory / gpuCount

	if gpuMemoryByDev > 100 {
		memoryUnit = "MiB"
	} else {
		memoryUnit = "GiB"
	}
}

// GetAllocation reads the per-container GPU allocation annotation written
// by the gpushare scheduler framework and aggregates it per device index.
func GetAllocation(pod *v1.Pod) map[int]int {
	podGPUMems := map[int]int{}
	if pod.ObjectMeta.Annotations == nil {
		return podGPUMems
	}
	allocationString, ok := pod.ObjectMeta.Annotations[gpushareAllocationFlag]
	if !ok {
		return podGPUMems
	}
	// The annotation is JSON of the form {containerIndex: {gpuID: mem}}.
	var allocation map[int]map[string]int
	err := json.Unmarshal([]byte(allocationString), &allocation)
	if err != nil {
		return podGPUMems
	}
	for _, containerAllocation := range allocation {
		for id, gpuMem := range containerAllocation {
			gpuIndex, err := strconv.Atoi(id)
			if err != nil {
				log.Errorf("failed to get gpu memory from pod annotation, reason: %v", err)
				return map[int]int{}
			}
			podGPUMems[gpuIndex] += gpuMem
		}
	}
	return podGPUMems
}
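For reference, GetAllocation expects the scheduler.framework.gpushare.allocation annotation value to be JSON keyed by container index and then by GPU id. A minimal sketch of the same decode-and-aggregate step (the annotation value below is a made-up example):

package main

import (
	"encoding/json"
	"fmt"
	"strconv"
)

func main() {
	// Hypothetical annotation value: container 0 uses 2 units on GPU "0",
	// container 1 uses 3 units on GPU "1".
	raw := `{"0":{"0":2},"1":{"1":3}}`

	// encoding/json accepts integer map keys for JSON object keys.
	var allocation map[int]map[string]int
	if err := json.Unmarshal([]byte(raw), &allocation); err != nil {
		panic(err)
	}

	// Aggregate per device index, as GetAllocation does.
	podGPUMems := map[int]int{}
	for _, containerAllocation := range allocation {
		for id, gpuMem := range containerAllocation {
			gpuIndex, err := strconv.Atoi(id)
			if err != nil {
				panic(err)
			}
			podGPUMems[gpuIndex] += gpuMem
		}
	}
	fmt.Println(podGPUMems) // map[0:2 1:3]
}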

gpushare-device-plugin/cmd/inspect/podinfo.go (new file, 134 lines)
@@ -0,0 +1,134 @@
package main

import (
	"fmt"
	"os"
	"path"
	"time"

	log "github.com/golang/glog"

	"k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/clientcmd"
)

var (
	clientConfig clientcmd.ClientConfig
	clientset    *kubernetes.Clientset
	restConfig   *rest.Config
	retries      = 5
)

// kubeInit builds the Kubernetes client from $KUBECONFIG, falling back to
// ~/.kube/config.
func kubeInit() {
	kubeconfigFile := os.Getenv("KUBECONFIG")
	if kubeconfigFile == "" {
		kubeconfigFile = path.Join(os.Getenv("HOME"), "/.kube/config")
	}
	if _, err := os.Stat(kubeconfigFile); err != nil {
		log.Fatalf("Failed to find kubeconfig %s due to %v; please set the KUBECONFIG env variable", kubeconfigFile, err)
	}

	var err error
	restConfig, err = clientcmd.BuildConfigFromFlags("", kubeconfigFile)
	if err != nil {
		log.Fatalf("Failed due to %v", err)
	}
	clientset, err = kubernetes.NewForConfig(restConfig)
	if err != nil {
		log.Fatalf("Failed due to %v", err)
	}
}

type podInfo struct {
	name      string
	namespace string
}

func (p podInfo) equal(p1 podInfo) bool {
	return p.name == p1.name && p.namespace == p1.namespace
}

// getActivePodsByNode lists all pods scheduled to the given node, retrying
// on transient errors, and filters out terminated ones.
func getActivePodsByNode(nodeName string) ([]v1.Pod, error) {
	selector := fields.SelectorFromSet(fields.Set{"spec.nodeName": nodeName})
	pods, err := clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
		FieldSelector: selector.String(),
		LabelSelector: labels.Everything().String(),
	})

	for i := 0; i < retries && err != nil; i++ {
		pods, err = clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
			FieldSelector: selector.String(),
			LabelSelector: labels.Everything().String(),
		})
		time.Sleep(100 * time.Millisecond)
	}
	if err != nil {
		return []v1.Pod{}, fmt.Errorf("failed to get Pods in node %v", nodeName)
	}

	return filterActivePods(pods.Items), nil
}

// getActivePodsInAllNodes lists all pods in the cluster with the same retry
// behavior, then filters out terminated ones.
func getActivePodsInAllNodes() ([]v1.Pod, error) {
	pods, err := clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
		LabelSelector: labels.Everything().String(),
	})

	for i := 0; i < retries && err != nil; i++ {
		pods, err = clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
			LabelSelector: labels.Everything().String(),
		})
		time.Sleep(100 * time.Millisecond)
	}
	if err != nil {
		return []v1.Pod{}, fmt.Errorf("failed to get Pods")
	}
	return filterActivePods(pods.Items), nil
}

// filterActivePods drops pods that have already finished (Succeeded or Failed).
func filterActivePods(pods []v1.Pod) (activePods []v1.Pod) {
	activePods = []v1.Pod{}
	for _, pod := range pods {
		if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
			continue
		}

		activePods = append(activePods, pod)
	}

	return activePods
}

// getAllSharedGPUNode returns the nodes that expose the shared GPU resource.
func getAllSharedGPUNode() ([]v1.Node, error) {
	nodes := []v1.Node{}
	allNodes, err := clientset.CoreV1().Nodes().List(metav1.ListOptions{})
	if err != nil {
		return nodes, err
	}

	for _, item := range allNodes.Items {
		if isGPUSharingNode(item) {
			nodes = append(nodes, item)
		}
	}

	return nodes, nil
}

// gpuMemoryInPod sums the GPU memory limits across all containers.
func gpuMemoryInPod(pod v1.Pod) int {
	var total int
	for _, container := range pod.Spec.Containers {
		if val, ok := container.Resources.Limits[resourceName]; ok {
			total += int(val.Value())
		}
	}

	return total
}
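The two list helpers above repeat the same retry-with-delay shape inline: one initial attempt, then up to `retries` more spaced 100 ms apart. Extracted into a standalone helper it would look roughly like this (retryList is a hypothetical name, not part of this commit):

package main

import (
	"errors"
	"fmt"
	"time"
)

// retryList captures the retry pattern used by getActivePodsByNode and
// getActivePodsInAllNodes: one initial attempt, then up to `attempts`
// retries, each followed by a fixed delay.
func retryList(attempts int, delay time.Duration, fn func() error) error {
	err := fn()
	for i := 0; i < attempts && err != nil; i++ {
		err = fn()
		time.Sleep(delay)
	}
	return err
}

func main() {
	calls := 0
	err := retryList(5, 100*time.Millisecond, func() error {
		calls++
		if calls < 3 {
			return errors.New("transient")
		}
		return nil
	})
	fmt.Println(calls, err) // 3 <nil>
}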