synchronization
gpushare-device-plugin/pkg/gpu/nvidia/allocate.go (new file, 198 lines)
@@ -0,0 +1,198 @@
package nvidia

import (
    "fmt"
    "time"

    log "github.com/golang/glog"
    "golang.org/x/net/context"
    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/types"
    pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
)

var (
    clientTimeout    = 30 * time.Second
    lastAllocateTime time.Time
)

// init sets up the Kubernetes client used to look up and patch pods.
func init() {
    kubeInit()
}

func buildErrResponse(reqs *pluginapi.AllocateRequest, podReqGPU uint) *pluginapi.AllocateResponse {
    responses := pluginapi.AllocateResponse{}
    for _, req := range reqs.ContainerRequests {
        response := pluginapi.ContainerAllocateResponse{
            Envs: map[string]string{
                envNVGPU:               fmt.Sprintf("no-gpu-has-%d%s-to-run", podReqGPU, metric),
                EnvResourceIndex:       "-1",
                EnvResourceByPod:       fmt.Sprintf("%d", podReqGPU),
                EnvResourceByContainer: fmt.Sprintf("%d", uint(len(req.DevicesIDs))),
                EnvResourceByDev:       fmt.Sprintf("%d", getGPUMemory()),
            },
        }
        responses.ContainerResponses = append(responses.ContainerResponses, &response)
    }
    return &responses
}

// Allocate answers the kubelet's allocation request by matching it to an
// assumed GPU share pod and returning the device env vars for its containers.
func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
    reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
    responses := pluginapi.AllocateResponse{}

    log.Infoln("----Allocating GPU for gpu mem is started----")
    var (
        podReqGPU uint
        found     bool
        assumePod *v1.Pod
    )

    // podReqGPU = uint(0)
    for _, req := range reqs.ContainerRequests {
        podReqGPU += uint(len(req.DevicesIDs))
    }
    log.Infof("RequestPodGPUs: %d", podReqGPU)

    m.Lock()
    defer m.Unlock()
    log.Infoln("checking...")
    pods, err := getCandidatePods(m.queryKubelet, m.kubeletClient)
    if err != nil {
        log.Infof("invalid allocation request: failed to find candidate pods due to %v", err)
        return buildErrResponse(reqs, podReqGPU), nil
    }

    if log.V(4) {
        for _, pod := range pods {
            log.Infof("Pod %s in ns %s requests GPU Memory %d with timestamp %v",
                pod.Name,
                pod.Namespace,
                getGPUMemoryFromPodResource(pod),
                getAssumeTimeFromPodAnnotation(pod))
        }
    }

    for _, pod := range pods {
        if getGPUMemoryFromPodResource(pod) == podReqGPU {
            log.Infof("Found assumed GPU shared Pod %s in ns %s with GPU Memory %d",
                pod.Name,
                pod.Namespace,
                podReqGPU)
            assumePod = pod
            found = true
            break
        }
    }

    if found {
        id := getGPUIDFromPodAnnotation(assumePod)
        if id < 0 {
            log.Warningf("Failed to get the dev for pod %v", assumePod)
        }

        candidateDevID := ""
        if id >= 0 {
            ok := false
            candidateDevID, ok = m.GetDeviceNameByIndex(uint(id))
            if !ok {
                log.Warningf("Failed to find the dev for pod %v because it's not able to find dev with index %d",
                    assumePod,
                    id)
                id = -1
            }
        }

        if id < 0 {
            return buildErrResponse(reqs, podReqGPU), nil
        }
        log.Infof("gpu index %v, uuid: %v", id, candidateDevID)
        // 1. Create container requests
        for _, req := range reqs.ContainerRequests {
            reqGPU := uint(len(req.DevicesIDs))
            response := pluginapi.ContainerAllocateResponse{
                Envs: map[string]string{
                    envNVGPU:               fmt.Sprintf("%v", id),
                    EnvResourceIndex:       fmt.Sprintf("%d", id),
                    EnvResourceByPod:       fmt.Sprintf("%d", podReqGPU),
                    EnvResourceByContainer: fmt.Sprintf("%d", reqGPU),
                    EnvResourceByDev:       fmt.Sprintf("%d", getGPUMemory()),
                },
            }
            if m.disableCGPUIsolation {
                response.Envs["CGPU_DISABLE"] = "true"
            }
            responses.ContainerResponses = append(responses.ContainerResponses, &response)
        }

        // 2. Update Pod spec
        patchedAnnotationBytes, err := patchPodAnnotationSpecAssigned()
        if err != nil {
            return buildErrResponse(reqs, podReqGPU), nil
        }
        _, err = clientset.CoreV1().Pods(assumePod.Namespace).Patch(assumePod.Name, types.StrategicMergePatchType, patchedAnnotationBytes)
        if err != nil {
            // the object has been modified; please apply your changes to the latest version and try again
            if err.Error() == OptimisticLockErrorMsg {
                // retry
                _, err = clientset.CoreV1().Pods(assumePod.Namespace).Patch(assumePod.Name, types.StrategicMergePatchType, patchedAnnotationBytes)
                if err != nil {
                    log.Warningf("Failed due to %v", err)
                    return buildErrResponse(reqs, podReqGPU), nil
                }
            } else {
                log.Warningf("Failed due to %v", err)
                return buildErrResponse(reqs, podReqGPU), nil
            }
        }

    } else if len(m.devNameMap) == 1 {
        var devName string
        var devIndex uint
        for d, index := range m.devNameMap {
            devName = d
            devIndex = index
            break
        }
        log.Infof("this node has only one GPU device, skipping the pod search and directly assigning device %v(%v) to the container", devIndex, devName)
        for _, req := range reqs.ContainerRequests {
            reqGPU := uint(len(req.DevicesIDs))
            response := pluginapi.ContainerAllocateResponse{
                Envs: map[string]string{
                    envNVGPU:               devName,
                    EnvResourceIndex:       fmt.Sprintf("%d", devIndex),
                    EnvResourceByPod:       fmt.Sprintf("%d", podReqGPU),
                    EnvResourceByContainer: fmt.Sprintf("%d", reqGPU),
                    EnvResourceByDev:       fmt.Sprintf("%d", getGPUMemory()),
                },
            }
            if m.disableCGPUIsolation {
                response.Envs["CGPU_DISABLE"] = "true"
            }
            responses.ContainerResponses = append(responses.ContainerResponses, &response)
        }
        log.Infof("get allocated GPUs info %v", responses)
        return &responses, nil

    } else {
        log.Warningf("invalid allocation request: request GPU memory %d can't be satisfied.",
            podReqGPU)
        // return &responses, fmt.Errorf("invalid allocation request: request GPU memory %d can't be satisfied", reqGPU)
        return buildErrResponse(reqs, podReqGPU), nil
    }

    podName := ""
    if assumePod != nil {
        podName = assumePod.Name
    }
    log.Infof("pod %v, new allocated GPUs info %v", podName, &responses)
    log.Infof("----Allocating GPU for gpu mem for %v is ended----", podName)
    // // Add this to make sure the container is created at least
    // currentTime := time.Now()

    // currentTime.Sub(lastAllocateTime)

    return &responses, nil
}
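Not part of the commit: a minimal sketch of the environment a matched container ends up with, assuming a pod that requested 3 GiB of gpu-mem was assumed onto GPU index 1 of a card reported as 15 GiB. The concrete values are illustrative only; they mirror what the Allocate path above injects.

package main

import "fmt"

func main() {
    // Hypothetical result of Allocate for id = 1, podReqGPU = 3, getGPUMemory() = 15.
    envs := map[string]string{
        "NVIDIA_VISIBLE_DEVICES":       "1",  // envNVGPU: the physical GPU the container sees
        "ALIYUN_COM_GPU_MEM_IDX":       "1",  // EnvResourceIndex
        "ALIYUN_COM_GPU_MEM_POD":       "3",  // EnvResourceByPod: GiB requested by the whole pod
        "ALIYUN_COM_GPU_MEM_CONTAINER": "3",  // EnvResourceByContainer: GiB requested by this container
        "ALIYUN_COM_GPU_MEM_DEV":       "15", // EnvResourceByDev: total GiB on the device
    }
    for k, v := range envs {
        fmt.Printf("%s=%s\n", k, v)
    }
}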
gpushare-device-plugin/pkg/gpu/nvidia/const.go (new file, 36 lines)
@@ -0,0 +1,36 @@
package nvidia

import (
    pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
)

// MemoryUnit describes the unit used to count GPU memory; only GiB and MiB are supported.
type MemoryUnit string

const (
    resourceName  = "rainbond.com/gpu-mem"
    resourceCount = "rainbond.com/gpu-count"
    serverSock    = pluginapi.DevicePluginPath + "aliyungpushare.sock"

    OptimisticLockErrorMsg = "the object has been modified; please apply your changes to the latest version and try again"

    allHealthChecks             = "xids"
    containerTypeLabelKey       = "io.kubernetes.docker.type"
    containerTypeLabelSandbox   = "podsandbox"
    containerTypeLabelContainer = "container"
    containerLogPathLabelKey    = "io.kubernetes.container.logpath"
    sandboxIDLabelKey           = "io.kubernetes.sandbox.id"

    envNVGPU                   = "NVIDIA_VISIBLE_DEVICES"
    EnvResourceIndex           = "ALIYUN_COM_GPU_MEM_IDX"
    EnvResourceByPod           = "ALIYUN_COM_GPU_MEM_POD"
    EnvResourceByContainer     = "ALIYUN_COM_GPU_MEM_CONTAINER"
    EnvResourceByDev           = "ALIYUN_COM_GPU_MEM_DEV"
    EnvAssignedFlag            = "ALIYUN_COM_GPU_MEM_ASSIGNED"
    EnvResourceAssumeTime      = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
    EnvResourceAssignTime      = "ALIYUN_COM_GPU_MEM_ASSIGN_TIME"
    EnvNodeLabelForDisableCGPU = "cgpu.disable.isolation"

    GiBPrefix = MemoryUnit("GiB")
    MiBPrefix = MemoryUnit("MiB")
)
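As a usage sketch (not in this commit), a workload consumes the shared resource by putting rainbond.com/gpu-mem in its container limits; the image name and the quantity below are illustrative assumptions.

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"
)

func main() {
    // Hypothetical container spec requesting 3 units (GiB) of shared GPU memory.
    c := v1.Container{
        Name:  "cuda-app",
        Image: "nvidia/cuda:10.0-base",
        Resources: v1.ResourceRequirements{
            Limits: v1.ResourceList{
                "rainbond.com/gpu-mem": resource.MustParse("3"),
            },
        },
    }
    fmt.Println(c.Resources.Limits)
}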
gpushare-device-plugin/pkg/gpu/nvidia/coredump.go (new file, 30 lines)
@@ -0,0 +1,30 @@
package nvidia

import (
    "io/ioutil"
    "runtime"

    log "github.com/golang/glog"
)

// StackTrace returns the stack traces of the current goroutine, or of all
// goroutines when all is true.
func StackTrace(all bool) string {
    buf := make([]byte, 10240)
    size := 0

    for {
        size = runtime.Stack(buf, all)

        if size == len(buf) {
            // The buffer filled up completely, so the trace may be truncated:
            // double the buffer and retry.
            buf = make([]byte, len(buf)<<1)
            continue
        }
        break
    }

    return string(buf[:size])
}

func coredump(fileName string) {
    log.Infoln("Dump stacktrace to ", fileName)
    if err := ioutil.WriteFile(fileName, []byte(StackTrace(true)), 0644); err != nil {
        log.Warningf("Failed to write stacktrace to %s: %v", fileName, err)
    }
}
gpushare-device-plugin/pkg/gpu/nvidia/gpumanager.go (new file, 111 lines)
@@ -0,0 +1,111 @@
package nvidia

import (
    "fmt"
    "os"
    "syscall"
    "time"

    "github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client"
    "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
    "github.com/fsnotify/fsnotify"
    log "github.com/golang/glog"
    pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
)

type sharedGPUManager struct {
    enableMPS     bool
    healthCheck   bool
    queryKubelet  bool
    kubeletClient *client.KubeletClient
}

func NewSharedGPUManager(enableMPS, healthCheck, queryKubelet bool, bp MemoryUnit, client *client.KubeletClient) *sharedGPUManager {
    metric = bp
    return &sharedGPUManager{
        enableMPS:     enableMPS,
        healthCheck:   healthCheck,
        queryKubelet:  queryKubelet,
        kubeletClient: client,
    }
}

func (ngm *sharedGPUManager) Run() error {
    log.V(1).Infoln("Loading NVML")

    if err := nvml.Init(); err != nil {
        log.V(1).Infof("Failed to initialize NVML: %s.", err)
        log.V(1).Infof("If this is a GPU node, did you set the docker default runtime to `nvidia`?")
        select {}
    }
    defer func() { log.V(1).Infoln("Shutdown of NVML returned:", nvml.Shutdown()) }()

    log.V(1).Infoln("Fetching devices.")
    if getDeviceCount() == uint(0) {
        log.V(1).Infoln("No devices found. Waiting indefinitely.")
        select {}
    }

    log.V(1).Infoln("Starting FS watcher.")
    watcher, err := newFSWatcher(pluginapi.DevicePluginPath)
    if err != nil {
        log.V(1).Infoln("Failed to create FS watcher.")
        return err
    }
    defer watcher.Close()

    log.V(1).Infoln("Starting OS watcher.")
    sigs := newOSWatcher(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)

    restart := true
    var devicePlugin *NvidiaDevicePlugin

L:
    for {
        if restart {
            if devicePlugin != nil {
                devicePlugin.Stop()
            }

            devicePlugin, err = NewNvidiaDevicePlugin(ngm.enableMPS, ngm.healthCheck, ngm.queryKubelet, ngm.kubeletClient)
            if err != nil {
                log.Warningf("Failed to get device plugin due to %v", err)
                os.Exit(1)
            } else if err = devicePlugin.Serve(); err != nil {
                log.Warningf("Failed to start device plugin due to %v", err)
                os.Exit(2)
            } else {
                restart = false
            }
        }

        select {
        case event := <-watcher.Events:
            if event.Name == pluginapi.KubeletSocket && event.Op&fsnotify.Create == fsnotify.Create {
                log.V(1).Infof("inotify: %s created, restarting.", pluginapi.KubeletSocket)
                restart = true
            }

        case err := <-watcher.Errors:
            log.Warningf("inotify: %s", err)

        case s := <-sigs:
            switch s {
            case syscall.SIGHUP:
                log.V(1).Infoln("Received SIGHUP, restarting.")
                restart = true
            case syscall.SIGQUIT:
                t := time.Now()
                timestamp := fmt.Sprint(t.Format("20060102150405"))
                log.Infoln("generate core dump")
                coredump("/etc/kubernetes/go_" + timestamp + ".txt")
            default:
                log.V(1).Infof("Received signal \"%v\", shutting down.", s)
                devicePlugin.Stop()
                break L
            }
        }
    }

    return nil
}
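The command that drives this manager is not part of the diff. A minimal sketch of how a main package might wire it up is shown below; the flag names are assumptions, and the nil kubelet client is a placeholder for whatever pkg/kubelet/client constructor the real cmd uses.

package main

import (
    "flag"

    "github.com/AliyunContainerService/gpushare-device-plugin/pkg/gpu/nvidia"
)

func main() {
    healthCheck := flag.Bool("health-check", false, "enable GPU health checking (hypothetical flag)")
    queryKubelet := flag.Bool("query-kubelet", true, "list pending pods from the kubelet instead of the apiserver (hypothetical flag)")
    flag.Parse()

    // The real cmd builds a *client.KubeletClient here; nil is only a placeholder
    // so the sketch compiles on its own.
    ngm := nvidia.NewSharedGPUManager(false, *healthCheck, *queryKubelet, nvidia.GiBPrefix, nil)
    if err := ngm.Run(); err != nil {
        panic(err)
    }
}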
gpushare-device-plugin/pkg/gpu/nvidia/nvidia.go (new file, 152 lines)
@@ -0,0 +1,152 @@
package nvidia

import (
    "fmt"
    "strings"

    "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
    log "github.com/golang/glog"
    "golang.org/x/net/context"
    pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
)

var (
    gpuMemory uint
    metric    MemoryUnit
)

func check(err error) {
    if err != nil {
        log.Fatalln("Fatal:", err)
    }
}

func generateFakeDeviceID(realID string, fakeCounter uint) string {
    return fmt.Sprintf("%s-_-%d", realID, fakeCounter)
}

func extractRealDeviceID(fakeDeviceID string) string {
    return strings.Split(fakeDeviceID, "-_-")[0]
}

func setGPUMemory(raw uint) {
    v := raw
    if metric == GiBPrefix {
        v = raw / 1024
    }
    gpuMemory = v
    log.Infof("set gpu memory: %d", gpuMemory)
}

func getGPUMemory() uint {
    return gpuMemory
}

func getDeviceCount() uint {
    n, err := nvml.GetDeviceCount()
    check(err)
    return n
}

func getDevices() ([]*pluginapi.Device, map[string]uint) {
    n, err := nvml.GetDeviceCount()
    check(err)

    var devs []*pluginapi.Device
    realDevNames := map[string]uint{}
    for i := uint(0); i < n; i++ {
        d, err := nvml.NewDevice(i)
        check(err)
        // realDevNames = append(realDevNames, d.UUID)
        var id uint
        log.Infof("Device %s's Path is %s", d.UUID, d.Path)
        _, err = fmt.Sscanf(d.Path, "/dev/nvidia%d", &id)
        check(err)
        realDevNames[d.UUID] = id
        // var KiB uint64 = 1024
        log.Infof("# device Memory: %d", uint(*d.Memory))
        if getGPUMemory() == uint(0) {
            setGPUMemory(uint(*d.Memory))
        }
        for j := uint(0); j < getGPUMemory(); j++ {
            fakeID := generateFakeDeviceID(d.UUID, j)
            if j == 0 {
                log.Infoln("# Add first device ID: " + fakeID)
            }
            if j == getGPUMemory()-1 {
                log.Infoln("# Add last device ID: " + fakeID)
            }
            devs = append(devs, &pluginapi.Device{
                ID:     fakeID,
                Health: pluginapi.Healthy,
            })
        }
    }

    return devs, realDevNames
}

func deviceExists(devs []*pluginapi.Device, id string) bool {
    for _, d := range devs {
        if d.ID == id {
            return true
        }
    }
    return false
}

func watchXIDs(ctx context.Context, devs []*pluginapi.Device, xids chan<- *pluginapi.Device) {
    eventSet := nvml.NewEventSet()
    defer nvml.DeleteEventSet(eventSet)

    for _, d := range devs {
        realDeviceID := extractRealDeviceID(d.ID)
        err := nvml.RegisterEventForDevice(eventSet, nvml.XidCriticalError, realDeviceID)
        if err != nil && strings.HasSuffix(err.Error(), "Not Supported") {
            log.Infof("Warning: %s (%s) is too old to support healthchecking: %s. Marking it unhealthy.", realDeviceID, d.ID, err)

            xids <- d
            continue
        }

        if err != nil {
            log.Fatalf("Fatal error: %v", err)
        }
    }

    for {
        select {
        case <-ctx.Done():
            return
        default:
        }

        e, err := nvml.WaitForEvent(eventSet, 5000)
        if err != nil && e.Etype != nvml.XidCriticalError {
            continue
        }

        // FIXME: formalize the full list and document it.
        // http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
        // Application errors: the GPU should still be healthy
        if e.Edata == 31 || e.Edata == 43 || e.Edata == 45 {
            continue
        }

        if e.UUID == nil || len(*e.UUID) == 0 {
            // All devices are unhealthy
            for _, d := range devs {
                xids <- d
            }
            continue
        }

        for _, d := range devs {
            if extractRealDeviceID(d.ID) == *e.UUID {
                xids <- d
            }
        }
    }
}
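For illustration only: the fake-device-ID scheme above advertises one kubelet device per GiB (or MiB) of GPU memory, and the real NVML UUID can always be recovered by splitting on the -_- separator. The UUID below is a made-up value.

package main

import (
    "fmt"
    "strings"
)

// Mirrors generateFakeDeviceID / extractRealDeviceID above.
func main() {
    realID := "GPU-3fa252d1-6ad6-4f84-8c2d-example" // hypothetical NVML UUID
    fake := fmt.Sprintf("%s-_-%d", realID, 7)       // the 8th fake device backed by that card

    fmt.Println(fake)
    fmt.Println(strings.Split(fake, "-_-")[0] == realID) // true: the physical device is recovered
}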
gpushare-device-plugin/pkg/gpu/nvidia/podmanager.go (new file, 262 lines)
@@ -0,0 +1,262 @@
package nvidia

import (
    "encoding/json"
    "fmt"
    "os"
    "sort"
    "time"

    "github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client"
    log "github.com/golang/glog"
    "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/fields"
    "k8s.io/apimachinery/pkg/labels"
    "k8s.io/apimachinery/pkg/types"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/rest"
    "k8s.io/client-go/tools/clientcmd"
    nodeutil "k8s.io/kubernetes/pkg/util/node"
)

var (
    clientset *kubernetes.Clientset
    nodeName  string
    retries   = 8
)

func kubeInit() {
    kubeconfigFile := os.Getenv("KUBECONFIG")
    var err error
    var config *rest.Config

    if _, err = os.Stat(kubeconfigFile); err != nil {
        log.V(5).Infof("kubeconfig %s not found (%v), falling back to the in-cluster config", kubeconfigFile, err)
        config, err = rest.InClusterConfig()
        if err != nil {
            log.Fatalf("Failed due to %v", err)
        }
    } else {
        config, err = clientcmd.BuildConfigFromFlags("", kubeconfigFile)
        if err != nil {
            log.Fatalf("Failed due to %v", err)
        }
    }

    clientset, err = kubernetes.NewForConfig(config)
    if err != nil {
        log.Fatalf("Failed due to %v", err)
    }

    nodeName = os.Getenv("NODE_NAME")
    if nodeName == "" {
        log.Fatalln("Please set env NODE_NAME")
    }

}

func disableCGPUIsolationOrNot() (bool, error) {
    disable := false
    node, err := clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
    if err != nil {
        return disable, err
    }
    labels := node.ObjectMeta.Labels
    value, ok := labels[EnvNodeLabelForDisableCGPU]
    if ok && value == "true" {
        log.Infof("enable gpusharing mode and disable cgpu mode")
        disable = true
    }
    return disable, nil
}

func patchGPUCount(gpuCount int) error {
    node, err := clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
    if err != nil {
        return err
    }

    if val, ok := node.Status.Capacity[resourceCount]; ok {
        if val.Value() == int64(gpuCount) {
            log.Infof("No need to update Capacity %s", resourceCount)
            return nil
        }
    }

    newNode := node.DeepCopy()
    newNode.Status.Capacity[resourceCount] = *resource.NewQuantity(int64(gpuCount), resource.DecimalSI)
    newNode.Status.Allocatable[resourceCount] = *resource.NewQuantity(int64(gpuCount), resource.DecimalSI)
    // content := fmt.Sprintf(`[{"op": "add", "path": "/status/capacity/aliyun.com~gpu-count", "value": "%d"}]`, gpuCount)
    // _, err = clientset.CoreV1().Nodes().PatchStatus(nodeName, []byte(content))
    _, _, err = nodeutil.PatchNodeStatus(clientset.CoreV1(), types.NodeName(nodeName), node, newNode)
    if err != nil {
        log.Infof("Failed to update Capacity %s.", resourceCount)
    } else {
        log.Infof("Updated Capacity %s successfully.", resourceCount)
    }
    return err
}

func getPodList(kubeletClient *client.KubeletClient) (*v1.PodList, error) {
    podList, err := kubeletClient.GetNodeRunningPods()
    if err != nil {
        return nil, err
    }

    list, _ := json.Marshal(podList)
    log.V(8).Infof("get pods list %v", string(list))

    resultPodList := &v1.PodList{}
    for _, metaPod := range podList.Items {
        if metaPod.Status.Phase != v1.PodPending {
            continue
        }
        resultPodList.Items = append(resultPodList.Items, metaPod)
    }

    if len(resultPodList.Items) == 0 {
        return nil, fmt.Errorf("no pending pod found")
    }

    return resultPodList, nil
}

func getPodListsByQueryKubelet(kubeletClient *client.KubeletClient) (*v1.PodList, error) {
    podList, err := getPodList(kubeletClient)
    for i := 0; i < retries && err != nil; i++ {
        podList, err = getPodList(kubeletClient)
        log.Warningf("failed to get pending pod list, retrying")
        time.Sleep(100 * time.Millisecond)
    }
    if err != nil {
        log.Warningf("no pending pods found from the kubelet /pods api, falling back to listing from the apiserver")
        podList, err = getPodListsByListAPIServer()
        if err != nil {
            return nil, err
        }
    }
    return podList, nil
}

func getPodListsByListAPIServer() (*v1.PodList, error) {
    selector := fields.SelectorFromSet(fields.Set{"spec.nodeName": nodeName, "status.phase": "Pending"})
    podList, err := clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
        FieldSelector: selector.String(),
        LabelSelector: labels.Everything().String(),
    })
    for i := 0; i < 3 && err != nil; i++ {
        podList, err = clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
            FieldSelector: selector.String(),
            LabelSelector: labels.Everything().String(),
        })
        time.Sleep(1 * time.Second)
    }
    if err != nil {
        return nil, fmt.Errorf("failed to get Pods assigned to node %v", nodeName)
    }

    return podList, nil
}

func getPendingPodsInNode(queryKubelet bool, kubeletClient *client.KubeletClient) ([]v1.Pod, error) {
    // pods, err := m.lister.List(labels.Everything())
    // if err != nil {
    // 	return nil, err
    // }
    pods := []v1.Pod{}

    podIDMap := map[types.UID]bool{}

    var podList *v1.PodList
    var err error
    if queryKubelet {
        podList, err = getPodListsByQueryKubelet(kubeletClient)
        if err != nil {
            return nil, err
        }
    } else {
        podList, err = getPodListsByListAPIServer()
        if err != nil {
            return nil, err
        }
    }

    log.V(5).Infof("all pod list %v", podList.Items)

    // if log.V(5) {
    for _, pod := range podList.Items {
        if pod.Spec.NodeName != nodeName {
            log.Warningf("Pod %s in ns %s is not assigned to node %s as expected, it's placed on node %s",
                pod.Name,
                pod.Namespace,
                nodeName,
                pod.Spec.NodeName)
        } else {
            log.Infof("list pod %s in ns %s in node %s and status is %s",
                pod.Name,
                pod.Namespace,
                nodeName,
                pod.Status.Phase,
            )
            if _, ok := podIDMap[pod.UID]; !ok {
                pods = append(pods, pod)
                podIDMap[pod.UID] = true
            }
        }

    }
    // }

    return pods, nil
}

// getCandidatePods picks up the GPU share pods that have been assumed by the
// scheduler but not yet assigned a device, ordered by assume time.
func getCandidatePods(queryKubelet bool, client *client.KubeletClient) ([]*v1.Pod, error) {
    candidatePods := []*v1.Pod{}
    allPods, err := getPendingPodsInNode(queryKubelet, client)
    if err != nil {
        return candidatePods, err
    }
    for _, pod := range allPods {
        current := pod
        if isGPUMemoryAssumedPod(&current) {
            candidatePods = append(candidatePods, &current)
        }
    }

    if log.V(4) {
        for _, pod := range candidatePods {
            log.Infof("candidate pod %s in ns %s with timestamp %d is found.",
                pod.Name,
                pod.Namespace,
                getAssumeTimeFromPodAnnotation(pod))
        }
    }

    return makePodOrderdByAge(candidatePods), nil
}

// make the pods ordered by GPU assume time
func makePodOrderdByAge(pods []*v1.Pod) []*v1.Pod {
    newPodList := make(orderedPodByAssumeTime, 0, len(pods))
    for _, v := range pods {
        newPodList = append(newPodList, v)
    }
    sort.Sort(newPodList)
    return []*v1.Pod(newPodList)
}

type orderedPodByAssumeTime []*v1.Pod

func (this orderedPodByAssumeTime) Len() int {
    return len(this)
}

func (this orderedPodByAssumeTime) Less(i, j int) bool {
    return getAssumeTimeFromPodAnnotation(this[i]) <= getAssumeTimeFromPodAnnotation(this[j])
}

func (this orderedPodByAssumeTime) Swap(i, j int) {
    this[i], this[j] = this[j], this[i]
}
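A small illustration of the ordering rule used by getCandidatePods: the pod with the earliest ALIYUN_COM_GPU_MEM_ASSUME_TIME annotation is allocated first. Because the package helpers above are unexported, the sketch re-implements the comparison inline with sort.Slice; the pod names and timestamps are made up.

package main

import (
    "fmt"
    "sort"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const assumeTimeKey = "ALIYUN_COM_GPU_MEM_ASSUME_TIME" // EnvResourceAssumeTime above

func assumedPod(name, ts string) *v1.Pod {
    return &v1.Pod{ObjectMeta: metav1.ObjectMeta{
        Name:        name,
        Annotations: map[string]string{assumeTimeKey: ts},
    }}
}

func main() {
    pods := []*v1.Pod{
        assumedPod("pod-b", "1600000000000000002"),
        assumedPod("pod-a", "1600000000000000001"),
    }
    // Same rule as orderedPodByAssumeTime: earliest assume time first. String
    // comparison is enough here because both timestamps have the same number of
    // digits; the plugin itself parses them to uint64.
    sort.Slice(pods, func(i, j int) bool {
        return pods[i].Annotations[assumeTimeKey] < pods[j].Annotations[assumeTimeKey]
    })
    fmt.Println(pods[0].Name) // pod-a, assumed first, is allocated first
}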
gpushare-device-plugin/pkg/gpu/nvidia/podutils.go (new file, 182 lines)
@@ -0,0 +1,182 @@
package nvidia

import (
    "encoding/json"
    "fmt"
    "strconv"
    "time"

    log "github.com/golang/glog"
    v1 "k8s.io/api/core/v1"
)

// update pod annotations with the assigned status
func updatePodAnnotations(oldPod *v1.Pod) (newPod *v1.Pod) {
    newPod = oldPod.DeepCopy()
    if len(newPod.ObjectMeta.Annotations) == 0 {
        newPod.ObjectMeta.Annotations = map[string]string{}
    }

    now := time.Now()
    newPod.ObjectMeta.Annotations[EnvAssignedFlag] = "true"
    newPod.ObjectMeta.Annotations[EnvResourceAssumeTime] = fmt.Sprintf("%d", now.UnixNano())

    return newPod
}

func patchPodAnnotationSpecAssigned() ([]byte, error) {
    now := time.Now()
    patchAnnotations := map[string]interface{}{
        "metadata": map[string]map[string]string{"annotations": {
            EnvAssignedFlag:       "true",
            EnvResourceAssumeTime: fmt.Sprintf("%d", now.UnixNano()),
        }}}
    return json.Marshal(patchAnnotations)
}

func getGPUIDFromPodAnnotation(pod *v1.Pod) (id int) {
    var err error
    id = -1

    if len(pod.ObjectMeta.Annotations) > 0 {
        value, found := pod.ObjectMeta.Annotations[EnvResourceIndex]
        if found {
            id, err = strconv.Atoi(value)
            if err != nil {
                log.Warningf("Failed to parse dev id %s due to %v for pod %s in ns %s",
                    value,
                    err,
                    pod.Name,
                    pod.Namespace)
                id = -1
            }
        } else {
            log.Warningf("Failed to get dev id for pod %s in ns %s",
                pod.Name,
                pod.Namespace)
        }
    }

    return id
}

// get the assumed timestamp
func getAssumeTimeFromPodAnnotation(pod *v1.Pod) (assumeTime uint64) {
    if assumeTimeStr, ok := pod.ObjectMeta.Annotations[EnvResourceAssumeTime]; ok {
        u64, err := strconv.ParseUint(assumeTimeStr, 10, 64)
        if err != nil {
            log.Warningf("Failed to parse assume Timestamp %s due to %v", assumeTimeStr, err)
        } else {
            assumeTime = u64
        }
    }

    return assumeTime
}

// determine if the pod is a GPU share pod that is already assumed but not yet assigned
func isGPUMemoryAssumedPod(pod *v1.Pod) (assumed bool) {
    log.V(6).Infof("Determine if the pod %v is a GPUShared assumed pod", pod)
    var ok bool

    // 1. Check if it's for GPU share
    if getGPUMemoryFromPodResource(pod) <= 0 {
        log.V(6).Infof("Pod %s in namespace %s has no GPU Memory request, so it's not a GPUShared assumed pod.",
            pod.Name,
            pod.Namespace)
        return assumed
    }

    // 2. Check if it already has an assume time
    if _, ok = pod.ObjectMeta.Annotations[EnvResourceAssumeTime]; !ok {
        log.V(4).Infof("No assume timestamp for pod %s in namespace %s, so it's not a GPUShared assumed pod.",
            pod.Name,
            pod.Namespace)
        return assumed
    }

    // 3. Check if it has been assigned already
    if assigned, ok := pod.ObjectMeta.Annotations[EnvAssignedFlag]; ok {

        if assigned == "false" {
            log.V(4).Infof("Found GPUShared assumed pod %s in namespace %s.",
                pod.Name,
                pod.Namespace)
            assumed = true
        } else {
            log.Infof("GPU assigned flag for pod %s exists in namespace %s and its assigned status is %s, so it's not a GPUShared assumed pod.",
                pod.Name,
                pod.Namespace,
                assigned)
        }
    } else {
        log.Warningf("No GPU assigned flag for pod %s in namespace %s, so it's not a GPUShared assumed pod.",
            pod.Name,
            pod.Namespace)
    }

    return assumed
}

// Get the GPU Memory of the Pod
func getGPUMemoryFromPodResource(pod *v1.Pod) uint {
    var total uint
    containers := pod.Spec.Containers
    for _, container := range containers {
        if val, ok := container.Resources.Limits[resourceName]; ok {
            total += uint(val.Value())
        }
    }
    return total
}

func podIsNotRunning(pod v1.Pod) bool {
    status := pod.Status
    // deletionTimestamp
    if pod.DeletionTimestamp != nil {
        return true
    }

    // pod is scheduled but not initialized
    if status.Phase == v1.PodPending && podConditionTrueOnly(status.Conditions, v1.PodScheduled) {
        log.Infof("Pod %s only has PodScheduled, is not running", pod.Name)
        return true
    }

    return status.Phase == v1.PodFailed || status.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(status.ContainerStatuses)) || (status.Phase == v1.PodPending && podConditionTrueOnly(status.Conditions, v1.PodScheduled))
}

// notRunning returns true if every status is terminated or waiting, or the status list
// is empty.
func notRunning(statuses []v1.ContainerStatus) bool {
    for _, status := range statuses {
        if status.State.Terminated == nil && status.State.Waiting == nil {
            return false
        }
    }
    return true
}

func podConditionTrue(conditions []v1.PodCondition, expect v1.PodConditionType) bool {
    for _, condition := range conditions {
        if condition.Type == expect && condition.Status == v1.ConditionTrue {
            return true
        }
    }

    return false
}

func podConditionTrueOnly(conditions []v1.PodCondition, expect v1.PodConditionType) bool {
    if len(conditions) != 1 {
        return false
    }

    for _, condition := range conditions {
        if condition.Type == expect && condition.Status == v1.ConditionTrue {
            return true
        }
    }

    return false
}
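For reference, a sketch of the strategic-merge patch body that patchPodAnnotationSpecAssigned produces and that Allocate sends to the apiserver to mark the assumed pod as assigned. The timestamp is fixed here only so the output is deterministic.

package main

import (
    "encoding/json"
    "fmt"
)

func main() {
    // Same map shape as patchPodAnnotationSpecAssigned above.
    patch := map[string]interface{}{
        "metadata": map[string]map[string]string{"annotations": {
            "ALIYUN_COM_GPU_MEM_ASSIGNED":    "true",
            "ALIYUN_COM_GPU_MEM_ASSUME_TIME": "1600000000000000000",
        }},
    }
    b, _ := json.Marshal(patch)
    fmt.Println(string(b))
    // {"metadata":{"annotations":{"ALIYUN_COM_GPU_MEM_ASSIGNED":"true","ALIYUN_COM_GPU_MEM_ASSUME_TIME":"1600000000000000000"}}}
}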
gpushare-device-plugin/pkg/gpu/nvidia/server.go (new file, 241 lines)
@@ -0,0 +1,241 @@
package nvidia

import (
    "net"
    "os"
    "path"
    "sync"
    "time"

    "github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client"
    log "github.com/golang/glog"
    "golang.org/x/net/context"
    "google.golang.org/grpc"
    pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
)

// NvidiaDevicePlugin implements the Kubernetes device plugin API
type NvidiaDevicePlugin struct {
    devs                 []*pluginapi.Device
    realDevNames         []string
    devNameMap           map[string]uint
    devIndxMap           map[uint]string
    socket               string
    mps                  bool
    healthCheck          bool
    disableCGPUIsolation bool
    stop                 chan struct{}
    health               chan *pluginapi.Device
    queryKubelet         bool
    kubeletClient        *client.KubeletClient

    server *grpc.Server
    sync.RWMutex
}

// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
func NewNvidiaDevicePlugin(mps, healthCheck, queryKubelet bool, client *client.KubeletClient) (*NvidiaDevicePlugin, error) {
    devs, devNameMap := getDevices()
    devList := []string{}

    for dev := range devNameMap {
        devList = append(devList, dev)
    }

    log.Infof("Device Map: %v", devNameMap)
    log.Infof("Device List: %v", devList)

    err := patchGPUCount(len(devList))
    if err != nil {
        return nil, err
    }
    disableCGPUIsolation, err := disableCGPUIsolationOrNot()
    if err != nil {
        return nil, err
    }
    return &NvidiaDevicePlugin{
        devs:                 devs,
        realDevNames:         devList,
        devNameMap:           devNameMap,
        socket:               serverSock,
        mps:                  mps,
        healthCheck:          healthCheck,
        disableCGPUIsolation: disableCGPUIsolation,
        stop:                 make(chan struct{}),
        health:               make(chan *pluginapi.Device),
        queryKubelet:         queryKubelet,
        kubeletClient:        client,
    }, nil
}

func (m *NvidiaDevicePlugin) GetDeviceNameByIndex(index uint) (name string, found bool) {
    if len(m.devIndxMap) == 0 {
        m.devIndxMap = map[uint]string{}
        for k, v := range m.devNameMap {
            m.devIndxMap[v] = k
        }
        log.Infof("Get devIndexMap: %v", m.devIndxMap)
    }

    name, found = m.devIndxMap[index]
    return name, found
}

func (m *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
    return &pluginapi.DevicePluginOptions{}, nil
}

// dial establishes the gRPC communication with the registered device plugin.
func dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) {
    c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(),
        grpc.WithTimeout(timeout),
        grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
            return net.DialTimeout("unix", addr, timeout)
        }),
    )

    if err != nil {
        return nil, err
    }

    return c, nil
}

// Start starts the gRPC server of the device plugin
func (m *NvidiaDevicePlugin) Start() error {
    err := m.cleanup()
    if err != nil {
        return err
    }

    sock, err := net.Listen("unix", m.socket)
    if err != nil {
        return err
    }

    m.server = grpc.NewServer([]grpc.ServerOption{}...)
    pluginapi.RegisterDevicePluginServer(m.server, m)

    go m.server.Serve(sock)

    // Wait for the server to start by launching a blocking connection
    conn, err := dial(m.socket, 5*time.Second)
    if err != nil {
        return err
    }
    conn.Close()

    go m.healthcheck()

    lastAllocateTime = time.Now()

    return nil
}

// Stop stops the gRPC server
func (m *NvidiaDevicePlugin) Stop() error {
    if m.server == nil {
        return nil
    }

    m.server.Stop()
    m.server = nil
    close(m.stop)

    return m.cleanup()
}

// Register registers the device plugin for the given resourceName with Kubelet.
func (m *NvidiaDevicePlugin) Register(kubeletEndpoint, resourceName string) error {
    conn, err := dial(kubeletEndpoint, 5*time.Second)
    if err != nil {
        return err
    }
    defer conn.Close()

    client := pluginapi.NewRegistrationClient(conn)
    reqt := &pluginapi.RegisterRequest{
        Version:      pluginapi.Version,
        Endpoint:     path.Base(m.socket),
        ResourceName: resourceName,
    }

    _, err = client.Register(context.Background(), reqt)
    if err != nil {
        return err
    }
    return nil
}

// ListAndWatch lists devices and updates that list according to the health status
func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
    s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})

    for {
        select {
        case <-m.stop:
            return nil
        case d := <-m.health:
            // FIXME: there is no way to recover from the Unhealthy state.
            d.Health = pluginapi.Unhealthy
            s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})
        }
    }
}

func (m *NvidiaDevicePlugin) unhealthy(dev *pluginapi.Device) {
    m.health <- dev
}

func (m *NvidiaDevicePlugin) PreStartContainer(context.Context, *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
    return &pluginapi.PreStartContainerResponse{}, nil
}

func (m *NvidiaDevicePlugin) cleanup() error {
    if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) {
        return err
    }

    return nil
}

func (m *NvidiaDevicePlugin) healthcheck() {
    ctx, cancel := context.WithCancel(context.Background())

    var xids chan *pluginapi.Device
    if m.healthCheck {
        xids = make(chan *pluginapi.Device)
        go watchXIDs(ctx, m.devs, xids)
    }

    for {
        select {
        case <-m.stop:
            cancel()
            return
        case dev := <-xids:
            m.unhealthy(dev)
        }
    }
}

// Serve starts the gRPC server and registers the device plugin with Kubelet
func (m *NvidiaDevicePlugin) Serve() error {
    err := m.Start()
    if err != nil {
        log.Infof("Could not start device plugin: %s", err)
        return err
    }
    log.Infoln("Starting to serve on", m.socket)

    err = m.Register(pluginapi.KubeletSocket, resourceName)
    if err != nil {
        log.Infof("Could not register device plugin: %s", err)
        m.Stop()
        return err
    }
    log.Infoln("Registered device plugin with Kubelet")

    return nil
}
gpushare-device-plugin/pkg/gpu/nvidia/watchers.go (new file, 32 lines)
@@ -0,0 +1,32 @@
package nvidia

import (
    "os"
    "os/signal"

    "github.com/fsnotify/fsnotify"
)

func newFSWatcher(files ...string) (*fsnotify.Watcher, error) {
    watcher, err := fsnotify.NewWatcher()
    if err != nil {
        return nil, err
    }

    for _, f := range files {
        err = watcher.Add(f)
        if err != nil {
            watcher.Close()
            return nil, err
        }
    }

    return watcher, nil
}

func newOSWatcher(sigs ...os.Signal) chan os.Signal {
    sigChan := make(chan os.Signal, 1)
    signal.Notify(sigChan, sigs...)

    return sigChan
}