synchronization
gpushare-device-plugin/cmd/inspect/display.go (new file, 255 lines)
@@ -0,0 +1,255 @@
package main

import (
	"bytes"
	"fmt"
	"os"
	"strconv"
	"text/tabwriter"

	log "github.com/golang/glog"
	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
)

func displayDetails(nodeInfos []*NodeInfo) {
	w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
	var (
		totalGPUMemInCluster int64
		usedGPUMemInCluster  int64
		prtLineLen           int
	)

	for _, nodeInfo := range nodeInfos {
		address := "unknown"
		if len(nodeInfo.node.Status.Addresses) > 0 {
			// address = nodeInfo.node.Status.Addresses[0].Address
			for _, addr := range nodeInfo.node.Status.Addresses {
				if addr.Type == v1.NodeInternalIP {
					address = addr.Address
					break
				}
			}
		}

		totalGPUMemInNode := nodeInfo.gpuTotalMemory
		if totalGPUMemInNode <= 0 {
			continue
		}

		fmt.Fprintf(w, "\n")
		fmt.Fprintf(w, "NAME:\t%s\n", nodeInfo.node.Name)
		fmt.Fprintf(w, "IPADDRESS:\t%s\n", address)
		fmt.Fprintf(w, "\n")

		usedGPUMemInNode := 0
		var buf bytes.Buffer
		buf.WriteString("NAME\tNAMESPACE\t")
		for i := 0; i < nodeInfo.gpuCount; i++ {
			buf.WriteString(fmt.Sprintf("GPU%d(Allocated)\t", i))
		}

		if nodeInfo.hasPendingGPUMemory() {
			buf.WriteString("Pending(Allocated)\t")
		}
		buf.WriteString("\n")
		fmt.Fprintf(w, buf.String())

		var buffer bytes.Buffer
		exists := map[types.UID]bool{}
		for i, dev := range nodeInfo.devs {
			usedGPUMemInNode += dev.usedGPUMem
			for _, pod := range dev.pods {
				if _, ok := exists[pod.UID]; ok {
					continue
				}
				buffer.WriteString(fmt.Sprintf("%s\t%s\t", pod.Name, pod.Namespace))
				count := nodeInfo.gpuCount
				if nodeInfo.hasPendingGPUMemory() {
					count += 1
				}

				for k := 0; k < count; k++ {
					allocation := GetAllocation(&pod)
					if len(allocation) != 0 {
						buffer.WriteString(fmt.Sprintf("%d\t", allocation[k]))
						continue
					}
					if k == i || (i == -1 && k == nodeInfo.gpuCount) {
						buffer.WriteString(fmt.Sprintf("%d\t", getGPUMemoryInPod(pod)))
					} else {
						buffer.WriteString("0\t")
					}
				}
				buffer.WriteString("\n")
				exists[pod.UID] = true
			}
		}
		if prtLineLen == 0 {
			prtLineLen = buffer.Len() + 10
		}
		fmt.Fprintf(w, buffer.String())

		var gpuUsageInNode float64 = 0
		if totalGPUMemInNode > 0 {
			gpuUsageInNode = float64(usedGPUMemInNode) / float64(totalGPUMemInNode) * 100
		} else {
			fmt.Fprintf(w, "\n")
		}

		fmt.Fprintf(w, "Allocated :\t%d (%d%%)\t\n", usedGPUMemInNode, int64(gpuUsageInNode))
		fmt.Fprintf(w, "Total :\t%d \t\n", nodeInfo.gpuTotalMemory)
		// fmt.Fprintf(w, "-----------------------------------------------------------------------------------------\n")
		var prtLine bytes.Buffer
		for i := 0; i < prtLineLen; i++ {
			prtLine.WriteString("-")
		}
		prtLine.WriteString("\n")
		fmt.Fprintf(w, prtLine.String())
		totalGPUMemInCluster += int64(totalGPUMemInNode)
		usedGPUMemInCluster += int64(usedGPUMemInNode)
	}
	fmt.Fprintf(w, "\n")
	fmt.Fprintf(w, "\n")
	fmt.Fprintf(w, "Allocated/Total GPU Memory In Cluster:\t")
	log.V(2).Infof("gpu: %s, allocated GPU Memory %s", strconv.FormatInt(totalGPUMemInCluster, 10),
		strconv.FormatInt(usedGPUMemInCluster, 10))

	var gpuUsage float64 = 0
	if totalGPUMemInCluster > 0 {
		gpuUsage = float64(usedGPUMemInCluster) / float64(totalGPUMemInCluster) * 100
	}
	fmt.Fprintf(w, "%s/%s (%d%%)\t\n",
		strconv.FormatInt(usedGPUMemInCluster, 10),
		strconv.FormatInt(totalGPUMemInCluster, 10),
		int64(gpuUsage))
	// fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\n", ...)

	_ = w.Flush()
}

func getMaxGPUCount(nodeInfos []*NodeInfo) (max int) {
	for _, node := range nodeInfos {
		if node.gpuCount > max {
			max = node.gpuCount
		}
	}

	return max
}

func displaySummary(nodeInfos []*NodeInfo) {
	w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
	var (
		maxGPUCount          int
		totalGPUMemInCluster int64
		usedGPUMemInCluster  int64
		prtLineLen           int
	)

	hasPendingGPU := hasPendingGPUMemory(nodeInfos)

	maxGPUCount = getMaxGPUCount(nodeInfos)

	var buffer bytes.Buffer
	buffer.WriteString("NAME\tIPADDRESS\t")
	for i := 0; i < maxGPUCount; i++ {
		buffer.WriteString(fmt.Sprintf("GPU%d(Allocated/Total)\t", i))
	}

	if hasPendingGPU {
		buffer.WriteString("PENDING(Allocated)\t")
	}
	buffer.WriteString(fmt.Sprintf("GPU Memory(%s)\n", memoryUnit))

	// fmt.Fprintf(w, "NAME\tIPADDRESS\tROLE\tGPU(Allocated/Total)\tPENDING(Allocated)\n")
	fmt.Fprintf(w, buffer.String())
	for _, nodeInfo := range nodeInfos {
		address := "unknown"
		if len(nodeInfo.node.Status.Addresses) > 0 {
			// address = nodeInfo.node.Status.Addresses[0].Address
			for _, addr := range nodeInfo.node.Status.Addresses {
				if addr.Type == v1.NodeInternalIP {
					address = addr.Address
					break
				}
			}
		}

		gpuMemInfos := []string{}
		pendingGPUMemInfo := ""
		usedGPUMemInNode := 0
		totalGPUMemInNode := nodeInfo.gpuTotalMemory
		if totalGPUMemInNode <= 0 {
			continue
		}

		for i := 0; i < maxGPUCount; i++ {
			gpuMemInfo := "0/0"
			if dev, ok := nodeInfo.devs[i]; ok {
				gpuMemInfo = dev.String()
				usedGPUMemInNode += dev.usedGPUMem
			}
			gpuMemInfos = append(gpuMemInfos, gpuMemInfo)
		}

		// check if there is pending dev
		if dev, ok := nodeInfo.devs[-1]; ok {
			pendingGPUMemInfo = fmt.Sprintf("%d", dev.usedGPUMem)
			usedGPUMemInNode += dev.usedGPUMem
		}

		nodeGPUMemInfo := fmt.Sprintf("%d/%d", usedGPUMemInNode, totalGPUMemInNode)

		var buf bytes.Buffer
		buf.WriteString(fmt.Sprintf("%s\t%s\t", nodeInfo.node.Name, address))
		for i := 0; i < maxGPUCount; i++ {
			buf.WriteString(fmt.Sprintf("%s\t", gpuMemInfos[i]))
		}
		if hasPendingGPU {
			buf.WriteString(fmt.Sprintf("%s\t", pendingGPUMemInfo))
		}

		buf.WriteString(fmt.Sprintf("%s\n", nodeGPUMemInfo))
		fmt.Fprintf(w, buf.String())

		if prtLineLen == 0 {
			prtLineLen = buf.Len() + 20
		}

		usedGPUMemInCluster += int64(usedGPUMemInNode)
		totalGPUMemInCluster += int64(totalGPUMemInNode)
	}
	// fmt.Fprintf(w, "-----------------------------------------------------------------------------------------\n")
	var prtLine bytes.Buffer
	for i := 0; i < prtLineLen; i++ {
		prtLine.WriteString("-")
	}
	prtLine.WriteString("\n")
	fmt.Fprint(w, prtLine.String())

	fmt.Fprintf(w, "Allocated/Total GPU Memory In Cluster:\n")
	log.V(2).Infof("gpu: %s, allocated GPU Memory %s", strconv.FormatInt(totalGPUMemInCluster, 10),
		strconv.FormatInt(usedGPUMemInCluster, 10))
	var gpuUsage float64 = 0
	if totalGPUMemInCluster > 0 {
		gpuUsage = float64(usedGPUMemInCluster) / float64(totalGPUMemInCluster) * 100
	}
	fmt.Fprintf(w, "%s/%s (%d%%)\t\n",
		strconv.FormatInt(usedGPUMemInCluster, 10),
		strconv.FormatInt(totalGPUMemInCluster, 10),
		int64(gpuUsage))
	// fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\n", ...)

	_ = w.Flush()
}

func getGPUMemoryInPod(pod v1.Pod) int {
	gpuMem := 0
	for _, container := range pod.Spec.Containers {
		if val, ok := container.Resources.Limits[resourceName]; ok {
			gpuMem += int(val.Value())
		}
	}
	return gpuMem
}
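For orientation, the summary view produced by displaySummary looks roughly like the sketch below; node names, addresses, and memory figures are illustrative only, and the exact columns depend on the largest GPU count in the cluster and on whether any pending allocations exist:

	NAME    IPADDRESS  GPU0(Allocated/Total)  GPU1(Allocated/Total)  GPU Memory(GiB)
	node-a  10.0.0.12  4/15                   0/15                   4/30
	--------------------------------------------------------------------------------
	Allocated/Total GPU Memory In Cluster:
	4/30 (13%)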
gpushare-device-plugin/cmd/inspect/main.go (new file, 74 lines)
@@ -0,0 +1,74 @@
package main

import (
	"flag"
	"fmt"
	"os"

	v1 "k8s.io/api/core/v1"
)

const (
	resourceName         = "rainbond.com/gpu-mem"
	countName            = "rainbond.com/gpu-count"
	gpuCountKey          = "aliyun.accelerator/nvidia_count"
	cardNameKey          = "aliyun.accelerator/nvidia_name"
	gpuMemKey            = "aliyun.accelerator/nvidia_mem"
	pluginComponentKey   = "component"
	pluginComponentValue = "gpushare-device-plugin"

	envNVGPUID             = "ALIYUN_COM_GPU_MEM_IDX"
	envPodGPUMemory        = "ALIYUN_COM_GPU_MEM_POD"
	envTOTALGPUMEMORY      = "ALIYUN_COM_GPU_MEM_DEV"
	gpushareAllocationFlag = "scheduler.framework.gpushare.allocation"
)

func init() {
	kubeInit()
	// checkpointInit()
}

func main() {
	var nodeName string
	// nodeName := flag.String("nodeName", "", "nodeName")
	details := flag.Bool("d", false, "details")
	flag.Parse()

	args := flag.Args()
	if len(args) > 0 {
		nodeName = args[0]
	}

	var pods []v1.Pod
	var nodes []v1.Node
	var err error

	if nodeName == "" {
		nodes, err = getAllSharedGPUNode()
		if err == nil {
			pods, err = getActivePodsInAllNodes()
		}
	} else {
		nodes, err = getNodes(nodeName)
		if err == nil {
			pods, err = getActivePodsByNode(nodeName)
		}
	}

	if err != nil {
		fmt.Printf("Failed due to %v", err)
		os.Exit(1)
	}

	nodeInfos, err := buildAllNodeInfos(pods, nodes)
	if err != nil {
		fmt.Printf("Failed due to %v", err)
		os.Exit(1)
	}
	if *details {
		displayDetails(nodeInfos)
	} else {
		displaySummary(nodeInfos)
	}
}
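A note on how this entrypoint is meant to be invoked, inferred from the flag and argument handling above rather than stated anywhere in the commit: running the binary with no arguments prints the cluster-wide summary, passing a node name as the first positional argument restricts the query to that node, and adding -d switches to the per-pod detail view. kubeInit (defined in podinfo.go below) resolves the cluster from the KUBECONFIG environment variable, falling back to $HOME/.kube/config.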
gpushare-device-plugin/cmd/inspect/nodeinfo.go (new file, 271 lines)
@@ -0,0 +1,271 @@
package main

import (
	"encoding/json"
	"fmt"
	"strconv"

	log "github.com/golang/glog"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	v1 "k8s.io/api/core/v1"
)

type DeviceInfo struct {
	idx         int
	pods        []v1.Pod
	usedGPUMem  int
	totalGPUMem int
	node        v1.Node
}

func (d *DeviceInfo) String() string {
	if d.idx == -1 {
		return fmt.Sprintf("%d", d.usedGPUMem)
	}
	return fmt.Sprintf("%d/%d", d.usedGPUMem, d.totalGPUMem)
}

func (d *DeviceInfo) addGPUPod(pod v1.Pod) {
	if len(d.pods) == 0 {
		d.pods = []v1.Pod{}
	}
	d.pods = append(d.pods, pod)
}

type NodeInfo struct {
	pods           []v1.Pod
	node           v1.Node
	devs           map[int]*DeviceInfo
	gpuCount       int
	gpuTotalMemory int
	pluginPod      v1.Pod
}

// The key function
func buildAllNodeInfos(allPods []v1.Pod, nodes []v1.Node) ([]*NodeInfo, error) {
	nodeInfos := buildNodeInfoWithPods(allPods, nodes)
	for _, info := range nodeInfos {
		if info.gpuTotalMemory > 0 {
			setUnit(info.gpuTotalMemory, info.gpuCount)
			err := info.buildDeviceInfo()
			if err != nil {
				log.Warningf("Failed due to %v", err)
				continue
			}
		}
	}
	return nodeInfos, nil
}

func (n *NodeInfo) acquirePluginPod() v1.Pod {
	if n.pluginPod.Name == "" {
		for _, pod := range n.pods {
			if val, ok := pod.Labels[pluginComponentKey]; ok {
				if val == pluginComponentValue {
					n.pluginPod = pod
					break
				}
			}
		}
	}
	return n.pluginPod
}

func getTotalGPUMemory(node v1.Node) int {
	val, ok := node.Status.Allocatable[resourceName]

	if !ok {
		return 0
	}

	return int(val.Value())
}

func getGPUCountInNode(node v1.Node) int {
	val, ok := node.Status.Allocatable[countName]

	if !ok {
		return 0
	}

	return int(val.Value())
}

func buildNodeInfoWithPods(pods []v1.Pod, nodes []v1.Node) []*NodeInfo {
	nodeMap := map[string]*NodeInfo{}
	nodeList := []*NodeInfo{}

	for _, node := range nodes {
		var info *NodeInfo = &NodeInfo{}
		if value, ok := nodeMap[node.Name]; ok {
			info = value
		} else {
			nodeMap[node.Name] = info
			info.node = node
			info.pods = []v1.Pod{}
			info.gpuCount = getGPUCountInNode(node)
			info.gpuTotalMemory = getTotalGPUMemory(node)
			info.devs = map[int]*DeviceInfo{}

			for i := 0; i < info.gpuCount; i++ {
				dev := &DeviceInfo{
					pods:        []v1.Pod{},
					idx:         i,
					totalGPUMem: info.gpuTotalMemory / info.gpuCount,
					node:        info.node,
				}
				info.devs[i] = dev
			}
		}

		for _, pod := range pods {
			if pod.Spec.NodeName == node.Name {
				info.pods = append(info.pods, pod)
			}
		}
	}

	for _, v := range nodeMap {
		nodeList = append(nodeList, v)
	}
	return nodeList
}

func (n *NodeInfo) hasPendingGPUMemory() bool {
	_, found := n.devs[-1]
	return found
}

// Get used GPUs in checkpoint
func (n *NodeInfo) buildDeviceInfo() error {
	totalGPUMem := 0
	if n.gpuCount > 0 {
		totalGPUMem = n.gpuTotalMemory / n.gpuCount
	}
GPUSearchLoop:
	for _, pod := range n.pods {
		if gpuMemoryInPod(pod) <= 0 {
			continue GPUSearchLoop
		}
		for devID, usedGPUMem := range n.getDeivceInfo(pod) {
			if n.devs[devID] == nil {
				n.devs[devID] = &DeviceInfo{
					pods:        []v1.Pod{},
					idx:         devID,
					totalGPUMem: totalGPUMem,
					node:        n.node,
				}
			}
			n.devs[devID].usedGPUMem += usedGPUMem
			n.devs[devID].pods = append(n.devs[devID].pods, pod)
		}
	}
	return nil
}

func (n *NodeInfo) getDeivceInfo(pod v1.Pod) map[int]int {
	var err error
	id := -1
	allocation := map[int]int{}
	allocation = GetAllocation(&pod)
	if len(allocation) != 0 {
		return allocation
	}
	if len(pod.ObjectMeta.Annotations) > 0 {
		value, found := pod.ObjectMeta.Annotations[envNVGPUID]
		if found {
			id, err = strconv.Atoi(value)
			if err != nil {
				log.Warningf("Failed to parse dev id %s due to %v for pod %s in ns %s",
					value,
					err,
					pod.Name,
					pod.Namespace)
				id = -1
			}
		} else {
			log.Warningf("Failed to get dev id for pod %s in ns %s",
				pod.Name,
				pod.Namespace)
		}
	}
	allocation[id] = gpuMemoryInPod(pod)
	return allocation
}

func hasPendingGPUMemory(nodeInfos []*NodeInfo) (found bool) {
	for _, info := range nodeInfos {
		if info.hasPendingGPUMemory() {
			return true
		}
	}

	return false
}

func getNodes(nodeName string) ([]v1.Node, error) {
	node, err := clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
	return []v1.Node{*node}, err
}

func isGPUSharingNode(node v1.Node) bool {
	value, ok := node.Status.Allocatable[resourceName]

	if ok {
		ok = (int(value.Value()) > 0)
	}

	return ok
}

var (
	memoryUnit = ""
)

func setUnit(gpuMemory, gpuCount int) {
	if memoryUnit != "" {
		return
	}

	if gpuCount == 0 {
		return
	}

	gpuMemoryByDev := gpuMemory / gpuCount

	if gpuMemoryByDev > 100 {
		memoryUnit = "MiB"
	} else {
		memoryUnit = "GiB"
	}
}

func GetAllocation(pod *v1.Pod) map[int]int {
	podGPUMems := map[int]int{}
	allocationString := ""
	if pod.ObjectMeta.Annotations == nil {
		return podGPUMems
	}
	value, ok := pod.ObjectMeta.Annotations[gpushareAllocationFlag]
	if !ok {
		return podGPUMems
	}
	allocationString = value
	var allocation map[int]map[string]int
	err := json.Unmarshal([]byte(allocationString), &allocation)
	if err != nil {
		return podGPUMems
	}
	for _, containerAllocation := range allocation {
		for id, gpuMem := range containerAllocation {
			gpuIndex, err := strconv.Atoi(id)
			if err != nil {
				log.Errorf("failed to get gpu memory from pod annotation, reason: %v", err)
				return map[int]int{}
			}
			podGPUMems[gpuIndex] += gpuMem
		}
	}
	return podGPUMems
}
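As a minimal sketch of the annotation shape GetAllocation expects — inferred from its unmarshal target map[int]map[string]int, with made-up values — the outer JSON key is the container index, the inner key the GPU index, and the value the GPU memory allocated to that container. Assuming the snippet runs inside the same package:

	// Hypothetical pod annotation in the format parsed by GetAllocation:
	// container 0 holds 4 units of GPU memory on GPU 1 (values are illustrative).
	pod := v1.Pod{}
	pod.ObjectMeta.Annotations = map[string]string{
		gpushareAllocationFlag: `{"0":{"1":4}}`,
	}
	fmt.Println(GetAllocation(&pod)) // map[1:4]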
gpushare-device-plugin/cmd/inspect/podinfo.go (new file, 134 lines)
@@ -0,0 +1,134 @@
package main

import (
	"fmt"
	"os"
	"path"
	"time"

	log "github.com/golang/glog"

	"k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/clientcmd"
)

var (
	clientConfig clientcmd.ClientConfig
	clientset    *kubernetes.Clientset
	restConfig   *rest.Config
	retries      = 5
)

func kubeInit() {
	kubeconfigFile := os.Getenv("KUBECONFIG")
	if kubeconfigFile == "" {
		kubeconfigFile = path.Join(os.Getenv("HOME"), "/.kube/config")
	}
	if _, err := os.Stat(kubeconfigFile); err != nil {
		log.Fatalf("kubeconfig %s failed to find due to %v, please set KUBECONFIG env", kubeconfigFile, err)
	}

	var err error
	restConfig, err = clientcmd.BuildConfigFromFlags("", kubeconfigFile)
	if err != nil {
		log.Fatalf("Failed due to %v", err)
	}
	clientset, err = kubernetes.NewForConfig(restConfig)
	if err != nil {
		log.Fatalf("Failed due to %v", err)
	}
}

type podInfo struct {
	name      string
	namespace string
}

func (p podInfo) equal(p1 podInfo) bool {
	return p.name == p1.name && p.namespace == p1.namespace
}

func getActivePodsByNode(nodeName string) ([]v1.Pod, error) {
	selector := fields.SelectorFromSet(fields.Set{"spec.nodeName": nodeName})
	pods, err := clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
		FieldSelector: selector.String(),
		LabelSelector: labels.Everything().String(),
	})

	for i := 0; i < retries && err != nil; i++ {
		pods, err = clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
			FieldSelector: selector.String(),
			LabelSelector: labels.Everything().String(),
		})
		time.Sleep(100 * time.Millisecond)
	}
	if err != nil {
		return []v1.Pod{}, fmt.Errorf("failed to get Pods in node %v", nodeName)
	}

	return filterActivePods(pods.Items), nil
}

func getActivePodsInAllNodes() ([]v1.Pod, error) {
	pods, err := clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
		LabelSelector: labels.Everything().String(),
	})

	for i := 0; i < retries && err != nil; i++ {
		pods, err = clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
			LabelSelector: labels.Everything().String(),
		})
		time.Sleep(100 * time.Millisecond)
	}
	if err != nil {
		return []v1.Pod{}, fmt.Errorf("failed to get Pods")
	}
	return filterActivePods(pods.Items), nil
}

func filterActivePods(pods []v1.Pod) (activePods []v1.Pod) {
	activePods = []v1.Pod{}
	for _, pod := range pods {
		if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
			continue
		}

		activePods = append(activePods, pod)
	}

	return activePods
}

func getAllSharedGPUNode() ([]v1.Node, error) {
	nodes := []v1.Node{}
	allNodes, err := clientset.CoreV1().Nodes().List(metav1.ListOptions{})
	if err != nil {
		return nodes, err
	}

	for _, item := range allNodes.Items {
		if isGPUSharingNode(item) {
			nodes = append(nodes, item)
		}
	}

	return nodes, nil
}

func gpuMemoryInPod(pod v1.Pod) int {
	var total int
	containers := pod.Spec.Containers
	for _, container := range containers {
		if val, ok := container.Resources.Limits[resourceName]; ok {
			total += int(val.Value())
		}
	}

	return total
}
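Two observations on the listing helpers above: a failed List is retried up to retries (5) times with a 100 ms pause between attempts, and the spec.nodeName field selector restricts the per-node query server-side before filterActivePods drops Succeeded and Failed pods. The List calls take metav1.ListOptions without a context argument, which is the older client-go signature, so this code appears to target a pre-0.18 client-go release.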
gpushare-device-plugin/cmd/nvidia/main.go (new file, 78 lines)
@@ -0,0 +1,78 @@
package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"time"

	"github.com/AliyunContainerService/gpushare-device-plugin/pkg/gpu/nvidia"
	"github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client"
	log "github.com/golang/glog"
	"k8s.io/client-go/rest"
)

var (
	mps              = flag.Bool("mps", false, "Enable or Disable MPS")
	healthCheck      = flag.Bool("health-check", false, "Enable or disable Health check")
	memoryUnit       = flag.String("memory-unit", "GiB", "Set memoryUnit of the GPU Memory, support 'GiB' and 'MiB'")
	queryFromKubelet = flag.Bool("query-kubelet", false, "Query pending pods from kubelet instead of kube-apiserver")
	kubeletAddress   = flag.String("kubelet-address", "0.0.0.0", "Kubelet IP Address")
	kubeletPort      = flag.Uint("kubelet-port", 10250, "Kubelet listened Port")
	clientCert       = flag.String("client-cert", "", "Kubelet TLS client certificate")
	clientKey        = flag.String("client-key", "", "Kubelet TLS client key")
	token            = flag.String("token", "", "Kubelet client bearer token")
	timeout          = flag.Int("timeout", 10, "Kubelet client http timeout duration")
)

func buildKubeletClient() *client.KubeletClient {
	if *clientCert == "" && *clientKey == "" && *token == "" {
		tokenByte, err := ioutil.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/token")
		if err != nil {
			panic(fmt.Errorf("in cluster mode, find token failed, error: %v", err))
		}
		tokenStr := string(tokenByte)
		token = &tokenStr
	}
	kubeletClient, err := client.NewKubeletClient(&client.KubeletClientConfig{
		Address: *kubeletAddress,
		Port:    *kubeletPort,
		TLSClientConfig: rest.TLSClientConfig{
			Insecure:   true,
			ServerName: "gpushare-device-plugin",
			CertFile:   *clientCert,
			KeyFile:    *clientKey,
		},
		BearerToken: *token,
		HTTPTimeout: time.Duration(*timeout) * time.Second,
	})
	if err != nil {
		panic(err)
	}
	return kubeletClient
}

func main() {
	flag.Parse()
	log.V(1).Infoln("Start gpushare device plugin")

	kubeletClient := buildKubeletClient()
	ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck, *queryFromKubelet, translatememoryUnits(*memoryUnit), kubeletClient)
	err := ngm.Run()
	if err != nil {
		log.Fatalf("Failed due to %v", err)
	}
}

func translatememoryUnits(value string) nvidia.MemoryUnit {
	memoryUnit := nvidia.MemoryUnit(value)
	switch memoryUnit {
	case nvidia.MiBPrefix:
	case nvidia.GiBPrefix:
	default:
		log.Warningf("Unsupported memory unit: %s, use memoryUnit Gi as default", value)
		memoryUnit = nvidia.GiBPrefix
	}

	return memoryUnit
}
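Worth noting about buildKubeletClient above: when none of -client-cert, -client-key, and -token are set, the plugin assumes it is running in-cluster, reads the mounted service-account token from /var/run/secrets/kubernetes.io/serviceaccount/token, and presents it as a bearer token to the kubelet's secure port. Because Insecure is true, server certificate verification is skipped, so that token is the only credential in play.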
gpushare-device-plugin/cmd/podgetter/main.go (new file, 57 lines)
@@ -0,0 +1,57 @@
package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"time"

	"github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client"
	"k8s.io/client-go/rest"
)

var (
	clientCert string
	clientKey  string
	token      string
	timeout    int
)

func main() {
	flag.StringVar(&clientCert, "client-cert", "", "")
	flag.StringVar(&clientKey, "client-key", "", "")
	flag.StringVar(&token, "token", "", "")
	flag.IntVar(&timeout, "timeout", 10, "")

	flag.Parse()

	if clientCert == "" && clientKey == "" && token == "" {
		tokenByte, err := ioutil.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/token")
		if err != nil {
			panic(fmt.Errorf("in cluster mode, find token failed, error: %v", err))
		}
		token = string(tokenByte)
	}

	c, err := client.NewKubeletClient(&client.KubeletClientConfig{
		Address: "127.0.0.1",
		Port:    10250,
		TLSClientConfig: rest.TLSClientConfig{
			Insecure:   true,
			ServerName: "kubelet",
			CertFile:   clientCert,
			KeyFile:    clientKey,
		},
		BearerToken: token,
		HTTPTimeout: time.Duration(timeout) * time.Second,
	})
	if err != nil {
		fmt.Println(err)
		return
	}
	podsList, err := c.GetNodeRunningPods()
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(podsList)
}