Commit: synchronization

gpushare-scheduler-extender/pkg/cache/cache.go (vendored, new file, 177 lines)
@@ -0,0 +1,177 @@
package cache

import (
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
	"sync"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/types"
	corelisters "k8s.io/client-go/listers/core/v1"
)

type SchedulerCache struct {

	// nodes maps a node name to its NodeInfo.
	nodes map[string]*NodeInfo

	// nodeLister can list/get nodes from the shared informer's store.
	nodeLister corelisters.NodeLister

	// podLister can list/get pods from the shared informer's store.
	podLister corelisters.PodLister

	// knownPods records pods whose ALIYUN_GPU_ID annotation has been set;
	// an entry is removed once the pod is complete and deleted.
	knownPods map[types.UID]*v1.Pod
	nLock     *sync.RWMutex
}

func NewSchedulerCache(nLister corelisters.NodeLister, pLister corelisters.PodLister) *SchedulerCache {
	return &SchedulerCache{
		nodes:      make(map[string]*NodeInfo),
		nodeLister: nLister,
		podLister:  pLister,
		knownPods:  make(map[types.UID]*v1.Pod),
		nLock:      new(sync.RWMutex),
	}
}

func (cache *SchedulerCache) GetNodeinfos() []*NodeInfo {
	// The nodes map is mutated under nLock in GetNodeInfo(), so guard the read here as well.
	cache.nLock.RLock()
	defer cache.nLock.RUnlock()
	nodes := []*NodeInfo{}
	for _, n := range cache.nodes {
		nodes = append(nodes, n)
	}
	return nodes
}

// BuildCache builds the cache when initializing.
func (cache *SchedulerCache) BuildCache() error {
	log.V(5).Info("debug: begin to build scheduler cache")
	pods, err := cache.podLister.List(labels.Everything())
	if err != nil {
		return err
	}

	for _, pod := range pods {
		if utils.GetGPUMemoryFromPodAnnotation(pod) <= uint(0) {
			continue
		}

		if len(pod.Spec.NodeName) == 0 {
			continue
		}

		err = cache.AddOrUpdatePod(pod)
		if err != nil {
			return err
		}
	}

	return nil
}

func (cache *SchedulerCache) GetPod(name, namespace string) (*v1.Pod, error) {
	return cache.podLister.Pods(namespace).Get(name)
}

// KnownPod reports whether the pod UID is already tracked in the cache.
func (cache *SchedulerCache) KnownPod(podUID types.UID) bool {
	cache.nLock.RLock()
	defer cache.nLock.RUnlock()

	_, found := cache.knownPods[podUID]
	return found
}

func (cache *SchedulerCache) AddOrUpdatePod(pod *v1.Pod) error {
	log.V(100).Info("debug: Add or update pod info: %v", pod)
	log.V(100).Info("debug: Node %v", cache.nodes)
	if len(pod.Spec.NodeName) == 0 {
		log.V(100).Info("debug: pod %s in ns %s is not assigned to any node, skip", pod.Name, pod.Namespace)
		return nil
	}

	n, err := cache.GetNodeInfo(pod.Spec.NodeName)
	if err != nil {
		return err
	}
	podCopy := pod.DeepCopy()
	if n.addOrUpdatePod(podCopy) {
		// put it into known pods
		cache.rememberPod(pod.UID, podCopy)
	} else {
		log.V(100).Info("debug: pod %s in ns %s has an illegal gpu id %d, skip",
			pod.Name,
			pod.Namespace,
			utils.GetGPUIDFromAnnotation(pod))
	}

	return nil
}

// The lock is in cacheNode
func (cache *SchedulerCache) RemovePod(pod *v1.Pod) {
	log.V(100).Info("debug: Remove pod info: %v", pod)
	log.V(100).Info("debug: Node %v", cache.nodes)
	n, err := cache.GetNodeInfo(pod.Spec.NodeName)
	if err == nil {
		n.removePod(pod)
	} else {
		log.V(10).Info("debug: Failed to get node %s due to %v", pod.Spec.NodeName, err)
	}

	cache.forgetPod(pod.UID)
}

// GetNodeInfo gets the NodeInfo for a node name, building it if it doesn't exist yet.
func (cache *SchedulerCache) GetNodeInfo(name string) (*NodeInfo, error) {
	node, err := cache.nodeLister.Get(name)
	if err != nil {
		return nil, err
	}

	cache.nLock.Lock()
	defer cache.nLock.Unlock()
	n, ok := cache.nodes[name]

	if !ok {
		n = NewNodeInfo(node)
		cache.nodes[name] = n
	} else {
		// if the existing node turned from non-gpushare to gpushare:
		// if (utils.GetTotalGPUMemory(n.node) <= 0 && utils.GetTotalGPUMemory(node) > 0) ||
		// 	(utils.GetGPUCountInNode(n.node) <= 0 && utils.GetGPUCountInNode(node) > 0) ||
		// 	// if the existing node turned from gpushare to non-gpushare:
		// 	(utils.GetTotalGPUMemory(n.node) > 0 && utils.GetTotalGPUMemory(node) <= 0) ||
		// 	(utils.GetGPUCountInNode(n.node) > 0 && utils.GetGPUCountInNode(node) <= 0) {
		if len(cache.nodes[name].devs) == 0 ||
			utils.GetTotalGPUMemory(n.node) <= 0 ||
			utils.GetGPUCountInNode(n.node) <= 0 {
			log.V(10).Info("info: GetNodeInfo() needs to update node %s", name)

			// fix the scenario where the number of devices changes from 0 to a positive number
			cache.nodes[name].Reset(node)
			log.V(10).Info("info: node: %s, labels from cache after being updated: %v", n.node.Name, n.node.Labels)
		} else {
			log.V(10).Info("info: GetNodeInfo() uses the existing nodeInfo for %s", name)
		}
		log.V(100).Info("debug: node %s with devices %v", name, n.devs)
	}
	return n, nil
}

func (cache *SchedulerCache) forgetPod(uid types.UID) {
	cache.nLock.Lock()
	defer cache.nLock.Unlock()
	delete(cache.knownPods, uid)
}

func (cache *SchedulerCache) rememberPod(uid types.UID, pod *v1.Pod) {
	cache.nLock.Lock()
	defer cache.nLock.Unlock()
	cache.knownPods[pod.UID] = pod
}
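Editor's note: the cache above is read through its exported accessors only. As an illustration, here is a minimal sketch (not part of this commit; the package and function names are hypothetical, and it assumes log.NewLoggerWithLevel was called at startup) of walking the cache with those accessors:

package inspect // hypothetical helper package

import (
	"fmt"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
)

// DumpUsage prints per-GPU memory accounting using only the exported
// SchedulerCache/NodeInfo/DeviceInfo APIs defined in this commit.
func DumpUsage(c *cache.SchedulerCache) {
	for _, n := range c.GetNodeinfos() {
		fmt.Printf("node %s: %d GPUs, %d total GPU memory\n",
			n.GetName(), n.GetGPUCount(), n.GetTotalGPUMemory())
		for _, dev := range n.GetDevs() {
			if dev == nil { // GetDevs sizes the slice by gpuCount; be defensive
				continue
			}
			fmt.Printf("  used=%d total=%d\n", dev.GetUsedGPUMemory(), dev.GetTotalGPUMemory())
		}
	}
}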
							
								
								
									
gpushare-scheduler-extender/pkg/cache/configmap.go (vendored, new file, 33 lines)
@@ -0,0 +1,33 @@
package cache

import (
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	corelisters "k8s.io/client-go/listers/core/v1"
	clientgocache "k8s.io/client-go/tools/cache"

	"k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

var (
	ConfigMapLister         corelisters.ConfigMapLister
	ConfigMapInformerSynced clientgocache.InformerSynced
)

func getConfigMap(name string) *v1.ConfigMap {
	configMap, err := ConfigMapLister.ConfigMaps(metav1.NamespaceSystem).Get(name)

	// If we can't get the configmap just return nil. The resync will eventually
	// sync things up.
	if err != nil {
		if !apierrors.IsNotFound(err) {
			log.V(10).Info("warn: find configmap with error: %v", err)
			utilruntime.HandleError(err)
		}
		return nil
	}

	return configMap
}
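Editor's note: the ConfigMap read here is the one consumed by getUnhealthyGPUs() in nodeinfo.go below: name "unhealthy-gpu-<node>" in kube-system, with a comma-separated device-id list under the "gpus" key. A minimal sketch of producing it (the helper itself is hypothetical; only the name format, namespace, and data key come from this commit):

package main

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// markUnhealthyGPUs creates the ConfigMap that getConfigMap()/getUnhealthyGPUs() read.
func markUnhealthyGPUs(ctx context.Context, clientset *kubernetes.Clientset, nodeName, ids string) error {
	cm := &v1.ConfigMap{
		ObjectMeta: metav1.ObjectMeta{
			Name:      fmt.Sprintf("unhealthy-gpu-%s", nodeName),
			Namespace: metav1.NamespaceSystem,
		},
		Data: map[string]string{"gpus": ids}, // e.g. "0,3"
	}
	_, err := clientset.CoreV1().ConfigMaps(metav1.NamespaceSystem).Create(ctx, cm, metav1.CreateOptions{})
	return err
}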
							
								
								
									
gpushare-scheduler-extender/pkg/cache/deviceinfo.go (vendored, new file, 80 lines)
@@ -0,0 +1,80 @@
package cache

import (
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
	"sync"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
)

type DeviceInfo struct {
	idx    int
	podMap map[types.UID]*v1.Pod
	// usedGPUMem  uint
	totalGPUMem uint
	rwmu        *sync.RWMutex
}

func (d *DeviceInfo) GetPods() []*v1.Pod {
	pods := []*v1.Pod{}
	for _, pod := range d.podMap {
		pods = append(pods, pod)
	}
	return pods
}

func newDeviceInfo(index int, totalGPUMem uint) *DeviceInfo {
	return &DeviceInfo{
		idx:         index,
		totalGPUMem: totalGPUMem,
		podMap:      map[types.UID]*v1.Pod{},
		rwmu:        new(sync.RWMutex),
	}
}

func (d *DeviceInfo) GetTotalGPUMemory() uint {
	return d.totalGPUMem
}

func (d *DeviceInfo) GetUsedGPUMemory() (gpuMem uint) {
	log.V(100).Info("debug: GetUsedGPUMemory() podMap %v, and its address is %p", d.podMap, d)
	d.rwmu.RLock()
	defer d.rwmu.RUnlock()
	for _, pod := range d.podMap {
		if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
			log.V(100).Info("debug: skip the pod %s in ns %s because its status is %s", pod.Name, pod.Namespace, pod.Status.Phase)
			continue
		}
		// gpuMem += utils.GetGPUMemoryFromPodEnv(pod)
		gpuMem += utils.GetGPUMemoryFromPodAnnotation(pod)
	}
	return gpuMem
}

func (d *DeviceInfo) addPod(pod *v1.Pod) {
	log.V(100).Info("debug: dev.addPod() Pod %s in ns %s with the GPU ID %d will be added to device map",
		pod.Name,
		pod.Namespace,
		d.idx)
	d.rwmu.Lock()
	defer d.rwmu.Unlock()
	d.podMap[pod.UID] = pod
	log.V(100).Info("debug: dev.addPod() after update is %v, and its address is %p",
		d.podMap,
		d)
}

func (d *DeviceInfo) removePod(pod *v1.Pod) {
	log.V(100).Info("debug: dev.removePod() Pod %s in ns %s with the GPU ID %d will be removed from device map",
		pod.Name,
		pod.Namespace,
		d.idx)
	d.rwmu.Lock()
	defer d.rwmu.Unlock()
	delete(d.podMap, pod.UID)
	log.V(100).Info("debug: dev.removePod() after update is %v, and its address is %p",
		d.podMap,
		d)
}
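Editor's note: the DeviceInfo bookkeeping is a plain UID-keyed map. A minimal sketch of exercising it, written as a hypothetical test in the same package (not part of this commit); note the global logger must be initialized first, since addPod/removePod log through it:

package cache

import (
	"testing"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
)

func TestDeviceInfoPodTracking(t *testing.T) {
	// addPod/removePod log through the package-level logger, so initialize it first.
	log.NewLoggerWithLevel(3)

	dev := newDeviceInfo(0, 16276) // device 0; the total-memory value is arbitrary

	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{
		Name:      "demo",
		Namespace: "default",
		UID:       types.UID("uid-1"),
	}}

	dev.addPod(pod)
	if got := len(dev.GetPods()); got != 1 {
		t.Fatalf("expected 1 pod after addPod, got %d", got)
	}

	dev.removePod(pod)
	if got := len(dev.GetPods()); got != 0 {
		t.Fatalf("expected 0 pods after removePod, got %d", got)
	}
}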
							
								
								
									
gpushare-scheduler-extender/pkg/cache/nodeinfo.go (vendored, new file, 362 lines)
@@ -0,0 +1,362 @@
package cache

import (
	"context"
	"fmt"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
	"strconv"
	"strings"
	"sync"

	v1 "k8s.io/api/core/v1"

	"k8s.io/apimachinery/pkg/types"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

const (
	OptimisticLockErrorMsg = "the object has been modified; please apply your changes to the latest version and try again"
)

// NodeInfo is node level aggregated information.
type NodeInfo struct {
	ctx            context.Context
	name           string
	node           *v1.Node
	devs           map[int]*DeviceInfo
	gpuCount       int
	gpuTotalMemory int
	rwmu           *sync.RWMutex
}

// NewNodeInfo creates the node-level aggregated information.
func NewNodeInfo(node *v1.Node) *NodeInfo {
	log.V(10).Info("debug: NewNodeInfo() creates nodeInfo for %s", node.Name)

	devMap := map[int]*DeviceInfo{}
	for i := 0; i < utils.GetGPUCountInNode(node); i++ {
		devMap[i] = newDeviceInfo(i, uint(utils.GetTotalGPUMemory(node)/utils.GetGPUCountInNode(node)))
	}

	if len(devMap) == 0 {
		log.V(3).Info("warn: node %s with nodeinfo %v has no devices", node.Name, node)
	}

	return &NodeInfo{
		ctx:            context.Background(),
		name:           node.Name,
		node:           node,
		devs:           devMap,
		gpuCount:       utils.GetGPUCountInNode(node),
		gpuTotalMemory: utils.GetTotalGPUMemory(node),
		rwmu:           new(sync.RWMutex),
	}
}

// Reset only rebuilds the devices when the length of devs is 0.
func (n *NodeInfo) Reset(node *v1.Node) {
	n.gpuCount = utils.GetGPUCountInNode(node)
	n.gpuTotalMemory = utils.GetTotalGPUMemory(node)
	n.node = node
	if n.gpuCount == 0 {
		log.V(3).Info("warn: Reset for node %s but the gpu count is 0", node.Name)
	}

	if n.gpuTotalMemory == 0 {
		log.V(3).Info("warn: Reset for node %s but the gpu total memory is 0", node.Name)
	}

	if len(n.devs) == 0 && n.gpuCount > 0 {
		devMap := map[int]*DeviceInfo{}
		for i := 0; i < utils.GetGPUCountInNode(node); i++ {
			devMap[i] = newDeviceInfo(i, uint(n.gpuTotalMemory/n.gpuCount))
		}
		n.devs = devMap
	}
	log.V(3).Info("info: Reset() updates nodeInfo for %s with devs %v", node.Name, n.devs)
}

func (n *NodeInfo) GetName() string {
	return n.name
}

func (n *NodeInfo) GetDevs() []*DeviceInfo {
	devs := make([]*DeviceInfo, n.gpuCount)
	for i, dev := range n.devs {
		devs[i] = dev
	}
	return devs
}

func (n *NodeInfo) GetNode() *v1.Node {
	return n.node
}

func (n *NodeInfo) GetTotalGPUMemory() int {
	return n.gpuTotalMemory
}

func (n *NodeInfo) GetGPUCount() int {
	return n.gpuCount
}

func (n *NodeInfo) removePod(pod *v1.Pod) {
	n.rwmu.Lock()
	defer n.rwmu.Unlock()

	id := utils.GetGPUIDFromAnnotation(pod)
	if id >= 0 {
		dev, found := n.devs[id]
		if !found {
			log.V(3).Info("warn: Pod %s in ns %s failed to find the GPU ID %d in node %s", pod.Name, pod.Namespace, id, n.name)
		} else {
			dev.removePod(pod)
		}
	} else {
		log.V(3).Info("warn: Pod %s in ns %s has no valid GPU ID (%d) in node %s", pod.Name, pod.Namespace, id, n.name)
	}
}

// addOrUpdatePod adds the pod which has the GPU id to the node.
func (n *NodeInfo) addOrUpdatePod(pod *v1.Pod) (added bool) {
	n.rwmu.Lock()
	defer n.rwmu.Unlock()

	id := utils.GetGPUIDFromAnnotation(pod)
	log.V(3).Info("debug: addOrUpdatePod() Pod %s in ns %s with the GPU ID %d should be added to device map",
		pod.Name,
		pod.Namespace,
		id)
	if id >= 0 {
		dev, found := n.devs[id]
		if !found {
			log.V(3).Info("warn: Pod %s in ns %s failed to find the GPU ID %d in node %s", pod.Name, pod.Namespace, id, n.name)
		} else {
			dev.addPod(pod)
			added = true
		}
	} else {
		log.V(3).Info("warn: Pod %s in ns %s has no valid GPU ID (%d) in node %s", pod.Name, pod.Namespace, id, n.name)
	}
	return added
}

// Assume checks whether the pod can be allocated on the node.
func (n *NodeInfo) Assume(pod *v1.Pod) (allocatable bool) {
	allocatable = false

	n.rwmu.RLock()
	defer n.rwmu.RUnlock()

	availableGPUs := n.getAvailableGPUs()
	reqGPU := uint(utils.GetGPUMemoryFromPodResource(pod))
	log.V(10).Info("debug: AvailableGPUs: %v in node %s", availableGPUs, n.name)

	if len(availableGPUs) > 0 {
		for devID := 0; devID < len(n.devs); devID++ {
			availableGPU, ok := availableGPUs[devID]
			if ok {
				if availableGPU >= reqGPU {
					allocatable = true
					break
				}
			}
		}
	}

	return allocatable
}

func (n *NodeInfo) Allocate(clientset *kubernetes.Clientset, pod *v1.Pod) (err error) {
	var newPod *v1.Pod
	n.rwmu.Lock()
	defer n.rwmu.Unlock()
	log.V(3).Info("info: Allocate() ----Begin to allocate GPU mem for pod %s in ns %s----", pod.Name, pod.Namespace)
	// 1. Update the pod spec
	devId, found := n.allocateGPUID(pod)
	if found {
		log.V(3).Info("info: Allocate() 1. Allocate GPU ID %d to pod %s in ns %s.----", devId, pod.Name, pod.Namespace)
		// newPod := utils.GetUpdatedPodEnvSpec(pod, devId, nodeInfo.GetTotalGPUMemory()/nodeInfo.GetGPUCount())
		//newPod = utils.GetUpdatedPodAnnotationSpec(pod, devId, n.GetTotalGPUMemory()/n.GetGPUCount())
		patchedAnnotationBytes, err := utils.PatchPodAnnotationSpec(pod, devId, n.GetTotalGPUMemory()/n.GetGPUCount())
		if err != nil {
			return fmt.Errorf("failed to generate patched annotations, reason: %v", err)
		}
		newPod, err = clientset.CoreV1().Pods(pod.Namespace).Patch(n.ctx, pod.Name, types.StrategicMergePatchType, patchedAnnotationBytes, metav1.PatchOptions{})
		//_, err = clientset.CoreV1().Pods(newPod.Namespace).Update(newPod)
		if err != nil {
			// the object has been modified; please apply your changes to the latest version and try again
			if err.Error() == OptimisticLockErrorMsg {
				// retry
				pod, err = clientset.CoreV1().Pods(pod.Namespace).Get(n.ctx, pod.Name, metav1.GetOptions{})
				if err != nil {
					return err
				}
				// newPod = utils.GetUpdatedPodEnvSpec(pod, devId, nodeInfo.GetTotalGPUMemory()/nodeInfo.GetGPUCount())
				//newPod = utils.GetUpdatedPodAnnotationSpec(pod, devId, n.GetTotalGPUMemory()/n.GetGPUCount())
				//_, err = clientset.CoreV1().Pods(newPod.Namespace).Update(newPod)
				newPod, err = clientset.CoreV1().Pods(pod.Namespace).Patch(n.ctx, pod.Name, types.StrategicMergePatchType, patchedAnnotationBytes, metav1.PatchOptions{})
				if err != nil {
					return err
				}
			} else {
				log.V(3).Info("failed to patch pod %v", pod)
				return err
			}
		}
	} else {
		err = fmt.Errorf("The node %s can't place the pod %s in ns %s, and the pod spec is %v", pod.Spec.NodeName, pod.Name, pod.Namespace, pod)
	}

	// 2. Bind the pod to the node
	if err == nil {
		binding := &v1.Binding{
			ObjectMeta: metav1.ObjectMeta{Name: pod.Name, UID: pod.UID},
			Target:     v1.ObjectReference{Kind: "Node", Name: n.name},
		}
		log.V(3).Info("info: Allocate() 2. Try to bind pod %s in %s namespace to node %s with %v",
			pod.Name,
			pod.Namespace,
			pod.Spec.NodeName,
			binding)
		err = clientset.CoreV1().Pods(pod.Namespace).Bind(n.ctx, binding, metav1.CreateOptions{})
		if err != nil {
			log.V(3).Info("warn: Failed to bind the pod %s in ns %s due to %v", pod.Name, pod.Namespace, err)
			return err
		}
	}

	// 3. update the device info if the pod is updated successfully
	if err == nil {
		log.V(3).Info("info: Allocate() 3. Try to add pod %s in ns %s to dev %d",
			pod.Name,
			pod.Namespace,
			devId)
		dev, found := n.devs[devId]
		if !found {
			log.V(3).Info("warn: Pod %s in ns %s failed to find the GPU ID %d in node %s", pod.Name, pod.Namespace, devId, n.name)
		} else {
			dev.addPod(newPod)
		}
	}
	log.V(3).Info("info: Allocate() ----End to allocate GPU mem for pod %s in ns %s----", pod.Name, pod.Namespace)
	return err
}

// allocateGPUID allocates the GPU ID to the pod.
func (n *NodeInfo) allocateGPUID(pod *v1.Pod) (candidateDevID int, found bool) {

	reqGPU := uint(0)
	found = false
	candidateDevID = -1
	candidateGPUMemory := uint(0)
	availableGPUs := n.getAvailableGPUs()

	reqGPU = uint(utils.GetGPUMemoryFromPodResource(pod))

	if reqGPU > uint(0) {
		log.V(3).Info("info: reqGPU for pod %s in ns %s: %d", pod.Name, pod.Namespace, reqGPU)
		log.V(3).Info("info: AvailableGPUs: %v in node %s", availableGPUs, n.name)
		if len(availableGPUs) > 0 {
			for devID := 0; devID < len(n.devs); devID++ {
				availableGPU, ok := availableGPUs[devID]
				if ok {
					if availableGPU >= reqGPU {
						if candidateDevID == -1 || candidateGPUMemory > availableGPU {
							candidateDevID = devID
							candidateGPUMemory = availableGPU
						}

						found = true
					}
				}
			}
		}

		if found {
			log.V(3).Info("info: Find candidate dev id %d for pod %s in ns %s successfully.",
				candidateDevID,
				pod.Name,
				pod.Namespace)
		} else {
			log.V(3).Info("warn: Failed to find available GPUs %d for the pod %s in the namespace %s",
				reqGPU,
				pod.Name,
				pod.Namespace)
		}
	}

	return candidateDevID, found
}

func (n *NodeInfo) getAvailableGPUs() (availableGPUs map[int]uint) {
	allGPUs := n.getAllGPUs()
	usedGPUs := n.getUsedGPUs()
	unhealthyGPUs := n.getUnhealthyGPUs()
	availableGPUs = map[int]uint{}
	for id, totalGPUMem := range allGPUs {
		if usedGPUMem, found := usedGPUs[id]; found {
			availableGPUs[id] = totalGPUMem - usedGPUMem
		}
	}
	log.V(3).Info("info: available GPU list %v before removing unhealthy GPUs", availableGPUs)
	for id := range unhealthyGPUs {
		log.V(3).Info("info: delete dev %d from available GPU list", id)
		delete(availableGPUs, id)
	}
	log.V(3).Info("info: available GPU list %v after removing unhealthy GPUs", availableGPUs)

	return availableGPUs
}

// device index: gpu memory
func (n *NodeInfo) getUsedGPUs() (usedGPUs map[int]uint) {
	usedGPUs = map[int]uint{}
	for _, dev := range n.devs {
		usedGPUs[dev.idx] = dev.GetUsedGPUMemory()
	}
	log.V(3).Info("info: getUsedGPUs: %v in node %s, and devs %v", usedGPUs, n.name, n.devs)
	return usedGPUs
}

// device index: gpu memory
func (n *NodeInfo) getAllGPUs() (allGPUs map[int]uint) {
	allGPUs = map[int]uint{}
	for _, dev := range n.devs {
		allGPUs[dev.idx] = dev.totalGPUMem
	}
	log.V(3).Info("info: getAllGPUs: %v in node %s, and dev %v", allGPUs, n.name, n.devs)
	return allGPUs
}

// getUnhealthyGPUs gets the unhealthy GPUs from the configmap
func (n *NodeInfo) getUnhealthyGPUs() (unhealthyGPUs map[int]bool) {
	unhealthyGPUs = map[int]bool{}
	name := fmt.Sprintf("unhealthy-gpu-%s", n.GetName())
	log.V(3).Info("info: try to find unhealthy gpu configmap %s", name)
	cm := getConfigMap(name)
	if cm == nil {
		return
	}

	if devicesStr, found := cm.Data["gpus"]; found {
		log.V(3).Info("warn: the unhealthy gpus %s", devicesStr)
		idsStr := strings.Split(devicesStr, ",")
		for _, sid := range idsStr {
			id, err := strconv.Atoi(sid)
			if err != nil {
				log.V(3).Info("warn: failed to parse id %s due to %v", sid, err)
				continue
			}
			unhealthyGPUs[id] = true
		}
	} else {
		log.V(3).Info("info: skip, because there are no unhealthy gpus")
	}

	return
}
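Editor's note: allocateGPUID above implements a best-fit policy: among the GPUs whose available memory satisfies the request, it keeps the one with the least free memory (the `candidateGPUMemory > availableGPU` comparison). A standalone illustration of the same loop with made-up values:

package main

import "fmt"

// bestFit mirrors the selection loop in allocateGPUID: scan device ids in
// order and keep the candidate with the smallest sufficient free memory.
func bestFit(available map[int]uint, req uint, devCount int) (int, bool) {
	candidate, candidateMem, found := -1, uint(0), false
	for devID := 0; devID < devCount; devID++ {
		free, ok := available[devID]
		if ok && free >= req {
			if candidate == -1 || candidateMem > free {
				candidate, candidateMem = devID, free
			}
			found = true
		}
	}
	return candidate, found
}

func main() {
	// GPU 0 has 10 units free, GPU 1 has 4; a request for 3 lands on GPU 1.
	id, ok := bestFit(map[int]uint{0: 10, 1: 4}, 3, 2)
	fmt.Println(id, ok) // 1 true
}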
							
								
								
									
gpushare-scheduler-extender/pkg/gpushare/controller.go (new file, 346 lines)
@@ -0,0 +1,346 @@
package gpushare

import (
	"fmt"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
	"golang.org/x/time/rate"
	"time"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	kubeinformers "k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	clientgocache "k8s.io/client-go/tools/cache"
	"k8s.io/client-go/util/workqueue"

	typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/record"
)

var (
	KeyFunc = clientgocache.DeletionHandlingMetaNamespaceKeyFunc
)

type Controller struct {
	clientset *kubernetes.Clientset

	// podLister can list/get pods from the shared informer's store.
	podLister corelisters.PodLister

	// nodeLister can list/get nodes from the shared informer's store.
	nodeLister corelisters.NodeLister

	// podQueue is a rate limited work queue. This is used to queue work to be
	// processed instead of performing it as soon as a change happens. This
	// means we can ensure we only process a fixed amount of resources at a
	// time, and makes it easy to ensure we are never processing the same item
	// simultaneously in two different workers.
	podQueue workqueue.RateLimitingInterface

	// recorder is an event recorder for recording Event resources to the
	// Kubernetes API.
	recorder record.EventRecorder

	// podInformerSynced returns true if the pod store has been synced at least once.
	podInformerSynced clientgocache.InformerSynced

	// nodeInformerSynced returns true if the node store has been synced at least once.
	nodeInformerSynced clientgocache.InformerSynced

	schedulerCache *cache.SchedulerCache

	// The cache to store the pods to be removed
	removePodCache map[string]*v1.Pod
}

func NewController(clientset *kubernetes.Clientset, kubeInformerFactory kubeinformers.SharedInformerFactory, stopCh <-chan struct{}) (*Controller, error) {
	log.V(100).Info("info: Creating event broadcaster")
	eventBroadcaster := record.NewBroadcaster()
	// eventBroadcaster.StartLogging(log.Infof)
	eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: clientset.CoreV1().Events("")})
	recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "gpushare-schd-extender"})

	rateLimiter := workqueue.NewMaxOfRateLimiter(
		workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 1000*time.Second),
		&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(100), 500)},
	)
	c := &Controller{
		clientset:      clientset,
		podQueue:       workqueue.NewNamedRateLimitingQueue(rateLimiter, "podQueue"),
		recorder:       recorder,
		removePodCache: map[string]*v1.Pod{},
	}
	// Create pod informer.
	podInformer := kubeInformerFactory.Core().V1().Pods()
	podInformer.Informer().AddEventHandler(clientgocache.FilteringResourceEventHandler{
		FilterFunc: func(obj interface{}) bool {
			switch t := obj.(type) {
			case *v1.Pod:
				// log.V(100).Info("debug: added pod %s in ns %s", t.Name, t.Namespace)
				return utils.IsGPUsharingPod(t)
			case clientgocache.DeletedFinalStateUnknown:
				if pod, ok := t.Obj.(*v1.Pod); ok {
					log.V(100).Info("debug: delete pod %s in ns %s", pod.Name, pod.Namespace)
					return utils.IsGPUsharingPod(pod)
				}
				runtime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod in %T", obj, c))
				return false
			default:
				runtime.HandleError(fmt.Errorf("unable to handle object in %T: %T", c, obj))
				return false
			}
		},
		Handler: clientgocache.ResourceEventHandlerFuncs{
			AddFunc:    c.addPodToCache,
			UpdateFunc: c.updatePodInCache,
			DeleteFunc: c.deletePodFromCache,
		},
	})

	c.podLister = podInformer.Lister()
	c.podInformerSynced = podInformer.Informer().HasSynced

	// Create node informer
	nodeInformer := kubeInformerFactory.Core().V1().Nodes()
	c.nodeLister = nodeInformer.Lister()
	c.nodeInformerSynced = nodeInformer.Informer().HasSynced

	// Create configMap informer
	cmInformer := kubeInformerFactory.Core().V1().ConfigMaps()
	cache.ConfigMapLister = cmInformer.Lister()
	cache.ConfigMapInformerSynced = cmInformer.Informer().HasSynced

	// Start informer goroutines.
	go kubeInformerFactory.Start(stopCh)

	// Create scheduler Cache
	c.schedulerCache = cache.NewSchedulerCache(c.nodeLister, c.podLister)

	log.V(100).Info("info: begin to wait for cache")

	if ok := clientgocache.WaitForCacheSync(stopCh, c.nodeInformerSynced); !ok {
		return nil, fmt.Errorf("failed to wait for node caches to sync")
	} else {
		log.V(100).Info("info: init the node cache successfully")
	}

	if ok := clientgocache.WaitForCacheSync(stopCh, c.podInformerSynced); !ok {
		return nil, fmt.Errorf("failed to wait for pod caches to sync")
	} else {
		log.V(100).Info("info: init the pod cache successfully")
	}

	if ok := clientgocache.WaitForCacheSync(stopCh, cache.ConfigMapInformerSynced); !ok {
		return nil, fmt.Errorf("failed to wait for configmap caches to sync")
	} else {
		log.V(100).Info("info: init the configmap cache successfully")
	}

	log.V(100).Info("info: end to wait for cache")

	return c, nil
}

func (c *Controller) BuildCache() error {
	return c.schedulerCache.BuildCache()
}

func (c *Controller) GetSchedulerCache() *cache.SchedulerCache {
	return c.schedulerCache
}

// Run starts the worker goroutines and blocks until stopCh is closed.
func (c *Controller) Run(threadiness int, stopCh <-chan struct{}) error {
	defer runtime.HandleCrash()
	defer c.podQueue.ShutDown()

	log.V(9).Info("info: Starting GPU Sharing Controller.")
	log.V(9).Info("info: Waiting for informer caches to sync")

	log.V(9).Info("info: Starting %v workers.", threadiness)
	for i := 0; i < threadiness; i++ {
		go wait.Until(c.runWorker, time.Second, stopCh)
	}

	log.V(3).Info("info: Started workers")
	<-stopCh
	log.V(3).Info("info: Shutting down workers")

	return nil
}

// runWorker is a long-running function that will continually call the
// processNextWorkItem function in order to read and process a message on the
// workqueue.
func (c *Controller) runWorker() {
	for c.processNextWorkItem() {
	}
}

// syncPod syncs the scheduler cache for the pod with the given key. This
// function is not meant to be invoked concurrently with the same key.
func (c *Controller) syncPod(key string) (forget bool, err error) {
	ns, name, err := clientgocache.SplitMetaNamespaceKey(key)
	log.V(9).Info("debug: begin to sync gpushare pod %s in ns %s", name, ns)
	if err != nil {
		return false, err
	}

	pod, err := c.podLister.Pods(ns).Get(name)
	switch {
	case errors.IsNotFound(err):
		log.V(10).Info("debug: pod %s in ns %s has been deleted.", name, ns)
		pod, found := c.removePodCache[key]
		if found {
			c.schedulerCache.RemovePod(pod)
			delete(c.removePodCache, key)
		}
	case err != nil:
		log.V(10).Info("warn: unable to retrieve pod %v from the store: %v", key, err)
	default:
		if utils.IsCompletePod(pod) {
			log.V(10).Info("debug: pod %s in ns %s has completed.", name, ns)
			c.schedulerCache.RemovePod(pod)
		} else {
			err := c.schedulerCache.AddOrUpdatePod(pod)
			if err != nil {
				return false, err
			}
		}
	}

	return true, nil
}

// processNextWorkItem will read a single work item off the podQueue and
// attempt to process it.
func (c *Controller) processNextWorkItem() bool {
	log.V(100).Info("debug: begin processNextWorkItem()")
	key, quit := c.podQueue.Get()
	if quit {
		return false
	}
	defer c.podQueue.Done(key)
	defer log.V(100).Info("debug: end processNextWorkItem()")
	forget, err := c.syncPod(key.(string))
	if err == nil {
		if forget {
			c.podQueue.Forget(key)
		}
		return true
	}

	log.V(3).Info("Error syncing pods: %v", err)
	runtime.HandleError(fmt.Errorf("Error syncing pod: %v", err))
	c.podQueue.AddRateLimited(key)

	return true
}

func (c *Controller) addPodToCache(obj interface{}) {
	pod, ok := obj.(*v1.Pod)
	if !ok {
		log.V(3).Info("warn: cannot convert to *v1.Pod: %v", obj)
		return
	}

	// if !assignedNonTerminatedPod(t) {
	// 	log.V(100).Info("debug: skip pod %s due to it's terminated.", pod.Name)
	// 	return
	// }

	podKey, err := KeyFunc(pod)
	if err != nil {
		log.V(3).Info("warn: Failed to get the jobkey: %v", err)
		return
	}

	c.podQueue.Add(podKey)

	// NOTE: Updating equivalence cache of addPodToCache has been
	// handled optimistically in: pkg/scheduler/scheduler.go#assume()
}

func (c *Controller) updatePodInCache(oldObj, newObj interface{}) {
	oldPod, ok := oldObj.(*v1.Pod)
	if !ok {
		log.V(3).Info("warn: cannot convert oldObj to *v1.Pod: %v", oldObj)
		return
	}
	newPod, ok := newObj.(*v1.Pod)
	if !ok {
		log.V(3).Info("warn: cannot convert newObj to *v1.Pod: %v", newObj)
		return
	}
	needUpdate := false

	podUID := oldPod.UID

	// 1. Need update when the pod has turned complete or failed
	if c.schedulerCache.KnownPod(podUID) && utils.IsCompletePod(newPod) {
		needUpdate = true
	}
	// 2. Need update when it's an unknown pod and the GPU annotation has been set
	if !c.schedulerCache.KnownPod(podUID) && utils.GetGPUIDFromAnnotation(newPod) >= 0 {
		needUpdate = true
	}
	if needUpdate {
		podKey, err := KeyFunc(newPod)
		if err != nil {
			log.V(3).Info("warn: Failed to get the jobkey: %v", err)
			return
		}
		log.V(3).Info("info: Need to update pod name %s in ns %s and old status is %v, new status is %v; its old annotation %v and new annotation %v",
			newPod.Name,
			newPod.Namespace,
			oldPod.Status.Phase,
			newPod.Status.Phase,
			oldPod.Annotations,
			newPod.Annotations)
		c.podQueue.Add(podKey)
	} else {
		log.V(100).Info("debug: No need to update pod name %s in ns %s and old status is %v, new status is %v; its old annotation %v and new annotation %v",
			newPod.Name,
			newPod.Namespace,
			oldPod.Status.Phase,
			newPod.Status.Phase,
			oldPod.Annotations,
			newPod.Annotations)
	}

	return
}

func (c *Controller) deletePodFromCache(obj interface{}) {
	var pod *v1.Pod
	switch t := obj.(type) {
	case *v1.Pod:
		pod = t
	case clientgocache.DeletedFinalStateUnknown:
		var ok bool
		pod, ok = t.Obj.(*v1.Pod)
		if !ok {
			log.V(3).Info("warn: cannot convert to *v1.Pod: %v", t.Obj)
			return
		}
	default:
		log.V(3).Info("warn: cannot convert to *v1.Pod: %v", t)
		return
	}

	log.V(100).Info("debug: delete pod %s in ns %s", pod.Name, pod.Namespace)
	podKey, err := KeyFunc(pod)
	if err != nil {
		log.V(3).Info("warn: Failed to get the jobkey: %v", err)
		return
	}
	c.podQueue.Add(podKey)
	c.removePodCache[podKey] = pod
}
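Editor's note: the entry point that wires this controller up is not part of this diff. A minimal sketch of how NewController, BuildCache, and Run fit together, under stated assumptions (in-cluster config, a 30s resync period, and 2 workers are all choices made here, not taken from the commit):

package main

import (
	"time"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/gpushare"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
	kubeinformers "k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

func main() {
	log.NewLoggerWithLevel(10) // the logger must be initialized before any log.V() call

	cfg, err := rest.InClusterConfig() // assumes the extender runs in-cluster
	if err != nil {
		log.Fatal("failed to build config: %v", err)
	}
	clientset, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		log.Fatal("failed to create clientset: %v", err)
	}

	stopCh := make(chan struct{})
	informerFactory := kubeinformers.NewSharedInformerFactory(clientset, time.Second*30)

	controller, err := gpushare.NewController(clientset, informerFactory, stopCh)
	if err != nil {
		log.Fatal("failed to create controller: %v", err)
	}
	if err := controller.BuildCache(); err != nil {
		log.Fatal("failed to build scheduler cache: %v", err)
	}

	controller.Run(2, stopCh) // 2 workers; blocks until stopCh is closed
}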
							
								
								
									
gpushare-scheduler-extender/pkg/log/level.go (new file, 70 lines)
@@ -0,0 +1,70 @@
package log

import (
	"fmt"
	"os"
	"sync"

	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
)

type levelLogger struct {
	level *int32
	mu    sync.Mutex
	log   *zap.Logger
}

type verbose bool

var l *levelLogger

func NewLoggerWithLevel(level int32, option ...zap.Option) {
	cfg := zap.NewProductionEncoderConfig()
	cfg.EncodeTime = zapcore.ISO8601TimeEncoder

	core := zapcore.NewCore(
		zapcore.NewJSONEncoder(cfg),
		zapcore.Lock(os.Stdout),
		zap.NewAtomicLevel(),
	)

	if option == nil {
		option = []zap.Option{}
	}
	option = append(option, zap.AddCaller(), zap.AddCallerSkip(1))
	l = &levelLogger{
		level: &level,
		mu:    sync.Mutex{},
		log:   zap.New(core, option...),
	}
}

/*
V selects a message's verbosity: the message is emitted only when its level is
strictly below the logger's configured level (default level 3, debug level 10).
Example levels:

	api request  4
	api response 9

	services func 5

	db error  9
	db query  11
	db result 15
*/
func V(level int32) verbose {
	return level < *l.level
}

func (v verbose) Info(format string, args ...interface{}) {
	if v {
		l.log.Info(fmt.Sprintf(format, args...))
	}
}

func Fatal(format string, args ...interface{}) {
	l.log.Fatal(fmt.Sprintf(format, args...))
}
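Editor's note: a minimal usage sketch of the level gate above; note that V(n) passes only when n is strictly below the configured level:

package main

import "github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"

func main() {
	// Configure the global logger with level 10: V(n) passes only for n < 10.
	log.NewLoggerWithLevel(10)

	log.V(3).Info("info: this message is emitted (3 < 10)")
	log.V(10).Info("debug: this message is suppressed (10 is not < 10)")
	log.V(100).Info("debug: this message is suppressed as well")
}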
							
								
								
									
gpushare-scheduler-extender/pkg/routes/pprof.go (new file, 64 lines)
@@ -0,0 +1,64 @@
package routes

import (
	"net/http"
	"net/http/pprof"

	"github.com/julienschmidt/httprouter"
)

func AddPProf(r *httprouter.Router) {
	r.GET("/debug/pprof/", index)
	r.GET("/debug/pprof/cmdline/", cmdline)
	r.GET("/debug/pprof/profile/", profile)
	r.GET("/debug/pprof/symbol/", symbol)
	r.GET("/debug/pprof/trace/", trace)

	r.GET("/debug/pprof/heap/", heap)
	r.GET("/debug/pprof/goroutine/", goroutine)
	r.GET("/debug/pprof/block/", block)
	r.GET("/debug/pprof/threadcreate/", threadcreate)
	r.GET("/debug/pprof/mutex/", mutex)
}

// profiling tools handlers

func index(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Index(w, r)
}

func cmdline(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Cmdline(w, r)
}

func profile(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Profile(w, r)
}

func symbol(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Symbol(w, r)
}

func trace(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Trace(w, r)
}

func heap(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Handler("heap").ServeHTTP(w, r)
}

func goroutine(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Handler("goroutine").ServeHTTP(w, r)
}

func block(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Handler("block").ServeHTTP(w, r)
}

func threadcreate(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Handler("threadcreate").ServeHTTP(w, r)
}

func mutex(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Handler("mutex").ServeHTTP(w, r)
}
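Editor's note: a minimal sketch of mounting these handlers on a server (the port is arbitrary here, not taken from the commit):

package main

import (
	"net/http"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/routes"
	"github.com/julienschmidt/httprouter"
)

func main() {
	router := httprouter.New()
	routes.AddPProf(router) // mounts the /debug/pprof/* handlers above

	// Once serving, profiles can be pulled with, for example:
	//   go tool pprof http://localhost:32766/debug/pprof/heap/
	http.ListenAndServe(":32766", router)
}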
							
								
								
									
gpushare-scheduler-extender/pkg/routes/routes.go (new file, 181 lines)
@@ -0,0 +1,181 @@
 | 
			
		||||
package routes
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"bytes"
 | 
			
		||||
	"encoding/json"
 | 
			
		||||
	"fmt"
 | 
			
		||||
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
 | 
			
		||||
	"io"
 | 
			
		||||
	"net/http"
 | 
			
		||||
	"time"
 | 
			
		||||
 | 
			
		||||
	"github.com/julienschmidt/httprouter"
 | 
			
		||||
 | 
			
		||||
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/scheduler"
 | 
			
		||||
 | 
			
		||||
	schedulerapi "k8s.io/kube-scheduler/extender/v1"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
const (
 | 
			
		||||
	versionPath       = "/version"
 | 
			
		||||
	apiPrefix         = "/gpushare-scheduler"
 | 
			
		||||
	bindPrefix        = apiPrefix + "/bind"
 | 
			
		||||
	predicatesPrefix  = apiPrefix + "/filter"
 | 
			
		||||
	inspectPrefix     = apiPrefix + "/inspect/:nodename"
 | 
			
		||||
	inspectListPrefix = apiPrefix + "/inspect"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
var (
 | 
			
		||||
	version = "0.1.0"
 | 
			
		||||
	// mu      sync.RWMutex
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
func checkBody(w http.ResponseWriter, r *http.Request) {
 | 
			
		||||
	if r.Body == nil {
 | 
			
		||||
		http.Error(w, "Please send a request body", 400)
 | 
			
		||||
		return
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func InspectRoute(inspect *scheduler.Inspect) httprouter.Handle {
 | 
			
		||||
	return func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
 | 
			
		||||
		result := inspect.Handler(ps.ByName("nodename"))
 | 
			
		||||
 | 
			
		||||
		if resultBody, err := json.Marshal(result); err != nil {
 | 
			
		||||
			// panic(err)
 | 
			
		||||
			log.V(3).Info("warn: Failed due to %v", err)
 | 
			
		||||
			w.Header().Set("Content-Type", "application/json")
 | 
			
		||||
			w.WriteHeader(http.StatusInternalServerError)
 | 
			
		||||
			errMsg := fmt.Sprintf("{'error':'%s'}", err.Error())
 | 
			
		||||
			w.Write([]byte(errMsg))
 | 
			
		||||
		} else {
 | 
			
		||||
			w.Header().Set("Content-Type", "application/json")
 | 
			
		||||
			w.WriteHeader(http.StatusOK)
 | 
			
		||||
			w.Write(resultBody)
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func PredicateRoute(predicate *scheduler.Predicate) httprouter.Handle {
 | 
			
		||||
	return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
 | 
			
		||||
		checkBody(w, r)
 | 
			
		||||
 | 
			
		||||
		// mu.RLock()
 | 
			
		||||
		// defer mu.RUnlock()
 | 
			
		||||
 | 
			
		||||
		var buf bytes.Buffer
 | 
			
		||||
		body := io.TeeReader(r.Body, &buf)
 | 
			
		||||
 | 
			
		||||
		var extenderArgs schedulerapi.ExtenderArgs
 | 
			
		||||
		var extenderFilterResult *schedulerapi.ExtenderFilterResult
 | 
			
		||||
 | 
			
		||||
		if err := json.NewDecoder(body).Decode(&extenderArgs); err != nil {
 | 
			
		||||
			log.V(3).Info("warn: failed to parse request due to error %v", err)
 | 
			
		||||
			extenderFilterResult = &schedulerapi.ExtenderFilterResult{
 | 
			
		||||
				Nodes:       nil,
 | 
			
		||||
				FailedNodes: nil,
 | 
			
		||||
				Error:       err.Error(),
 | 
			
		||||
			}
 | 
			
		||||
		} else {
 | 
			
		||||
			log.V(90).Info("debug: gpusharingfilter ExtenderArgs =%v", extenderArgs)
 | 
			
		||||
			extenderFilterResult = predicate.Handler(&extenderArgs)
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		if resultBody, err := json.Marshal(extenderFilterResult); err != nil {
 | 
			
		||||
			// panic(err)
 | 
			
		||||
			log.V(3).Info("warn: Failed due to %v", err)
 | 
			
		||||
			w.Header().Set("Content-Type", "application/json")
 | 
			
		||||
			w.WriteHeader(http.StatusInternalServerError)
 | 
			
		||||
			errMsg := fmt.Sprintf("{\"error\":\"%s\"}", err.Error())
 | 
			
		||||
			w.Write([]byte(errMsg))
 | 
			
		||||
		} else {
 | 
			
		||||
			log.V(100).Info("predicate: %s,  extenderFilterResult = %s ", predicate.Name, resultBody)
 | 
			
		||||
			w.Header().Set("Content-Type", "application/json")
 | 
			
		||||
			w.WriteHeader(http.StatusOK)
 | 
			
		||||
			w.Write(resultBody)
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func BindRoute(bind *scheduler.Bind) httprouter.Handle {
 | 
			
		||||
	return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
 | 
			
		||||
		checkBody(w, r)
 | 
			
		||||
 | 
			
		||||
		// mu.Lock()
 | 
			
		||||
		// defer mu.Unlock()
 | 
			
		||||
		var buf bytes.Buffer
 | 
			
		||||
		body := io.TeeReader(r.Body, &buf)
 | 
			
		||||
 | 
			
		||||
		var extenderBindingArgs schedulerapi.ExtenderBindingArgs
 | 
			
		||||
		var extenderBindingResult *schedulerapi.ExtenderBindingResult
 | 
			
		||||
		failed := false
 | 
			
		||||
 | 
			
		||||
		if err := json.NewDecoder(body).Decode(&extenderBindingArgs); err != nil {
 | 
			
		||||
			extenderBindingResult = &schedulerapi.ExtenderBindingResult{
 | 
			
		||||
				Error: err.Error(),
 | 
			
		||||
			}
 | 
			
		||||
			failed = true
 | 
			
		||||
		} else {
 | 
			
		||||
			log.V(10).Info("debug: gpusharingBind ExtenderArgs =%v", extenderBindingArgs)
 | 
			
		||||
			extenderBindingResult = bind.Handler(extenderBindingArgs)
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		if len(extenderBindingResult.Error) > 0 {
 | 
			
		||||
			failed = true
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		if resultBody, err := json.Marshal(extenderBindingResult); err != nil {
 | 
			
		||||
			log.V(3).Info("warn: Failed due to %v", err)
 | 
			
		||||
			// panic(err)
 | 
			
		||||
			w.Header().Set("Content-Type", "application/json")
 | 
			
		||||
			w.WriteHeader(http.StatusInternalServerError)
 | 
			
		||||
			errMsg := fmt.Sprintf("{\"error\":\"%s\"}", err.Error())
 | 
			
		||||
			w.Write([]byte(errMsg))
 | 
			
		||||
		} else {
 | 
			
		||||
			log.V(3).Info("info: extenderBindingResult = %s", resultBody)
 | 
			
		||||
			w.Header().Set("Content-Type", "application/json")
 | 
			
		||||
			if failed {
 | 
			
		||||
				w.WriteHeader(http.StatusInternalServerError)
 | 
			
		||||
			} else {
 | 
			
		||||
				w.WriteHeader(http.StatusOK)
 | 
			
		||||
			}
 | 
			
		||||
 | 
			
		||||
			w.Write(resultBody)
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func VersionRoute(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
 | 
			
		||||
	fmt.Fprint(w, version)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func AddVersion(router *httprouter.Router) {
 | 
			
		||||
	router.GET(versionPath, DebugLogging(VersionRoute, versionPath))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func DebugLogging(h httprouter.Handle, path string) httprouter.Handle {
 | 
			
		||||
	return func(w http.ResponseWriter, r *http.Request, p httprouter.Params) {
 | 
			
		||||
		log.V(90).Info("path: %s, request body = %s", path, r.Body)
 | 
			
		||||
		startTime := time.Now()
 | 
			
		||||
		h(w, r, p)
 | 
			
		||||
		log.V(90).Info("path: %s, response: %v, cost_time: %v", path, w, time.Since(startTime))
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func AddPredicate(router *httprouter.Router, predicate *scheduler.Predicate) {
 | 
			
		||||
	// path := predicatesPrefix + "/" + predicate.Name
 | 
			
		||||
	router.POST(predicatesPrefix, DebugLogging(PredicateRoute(predicate), predicatesPrefix))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func AddBind(router *httprouter.Router, bind *scheduler.Bind) {
 | 
			
		||||
	if handle, _, _ := router.Lookup("POST", bindPrefix); handle != nil {
 | 
			
		||||
		log.V(3).Info("warning: AddBind was called more than once!")
 | 
			
		||||
	} else {
 | 
			
		||||
		router.POST(bindPrefix, DebugLogging(BindRoute(bind), bindPrefix))
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func AddInspect(router *httprouter.Router, inspect *scheduler.Inspect) {
 | 
			
		||||
	router.GET(inspectPrefix, DebugLogging(InspectRoute(inspect), inspectPrefix))
 | 
			
		||||
	router.GET(inspectListPrefix, DebugLogging(InspectRoute(inspect), inspectListPrefix))
 | 
			
		||||
}
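For orientation only, the sketch below shows one way the route helpers above could be wired into an HTTP server. It is not part of this commit; the package name, the import path of the routes package, the listen address, and the way the Predicate, Bind, and Inspect values are constructed are all assumptions.

	package server // hypothetical package, not part of this change

	import (
		"net/http"

		"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/routes" // assumed import path
		"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/scheduler"
		"github.com/julienschmidt/httprouter"
	)

	// runServer registers all extender endpoints and serves them.
	// predicate, bind and inspect are assumed to be built elsewhere
	// (e.g. via the New* constructors in the scheduler package).
	func runServer(predicate *scheduler.Predicate, bind *scheduler.Bind, inspect *scheduler.Inspect) error {
		router := httprouter.New()
		routes.AddVersion(router)
		routes.AddPredicate(router, predicate)
		routes.AddBind(router, bind)
		routes.AddInspect(router, inspect)
		// ":39999" is an assumed listen address, not taken from this change.
		return http.ListenAndServe(":39999", router)
	}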
 | 
			
		||||
							
								
								
									
26	gpushare-scheduler-extender/pkg/scheduler/bind.go	Normal file
@@ -0,0 +1,26 @@
 | 
			
		||||
package scheduler
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
 | 
			
		||||
	"k8s.io/apimachinery/pkg/types"
 | 
			
		||||
	schedulerapi "k8s.io/kube-scheduler/extender/v1"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
// Bind is responsible for binding node and pod
 | 
			
		||||
type Bind struct {
 | 
			
		||||
	Name  string
 | 
			
		||||
	Func  func(podName string, podNamespace string, podUID types.UID, node string, cache *cache.SchedulerCache) error
 | 
			
		||||
	cache *cache.SchedulerCache
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Handler handles the Bind request
 | 
			
		||||
func (b Bind) Handler(args schedulerapi.ExtenderBindingArgs) *schedulerapi.ExtenderBindingResult {
 | 
			
		||||
	err := b.Func(args.PodName, args.PodNamespace, args.PodUID, args.Node, b.cache)
 | 
			
		||||
	errMsg := ""
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		errMsg = err.Error()
 | 
			
		||||
	}
 | 
			
		||||
	return &schedulerapi.ExtenderBindingResult{
 | 
			
		||||
		Error: errMsg,
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
71	gpushare-scheduler-extender/pkg/scheduler/gpushare-bind.go	Normal file
@@ -0,0 +1,71 @@
 | 
			
		||||
package scheduler
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"context"
 | 
			
		||||
	"fmt"
 | 
			
		||||
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
 | 
			
		||||
 | 
			
		||||
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
 | 
			
		||||
	"k8s.io/api/core/v1"
 | 
			
		||||
	"k8s.io/apimachinery/pkg/api/errors"
 | 
			
		||||
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 | 
			
		||||
	"k8s.io/apimachinery/pkg/types"
 | 
			
		||||
	"k8s.io/client-go/kubernetes"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
const (
 | 
			
		||||
	OptimisticLockErrorMsg = "the object has been modified; please apply your changes to the latest version and try again"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
func NewGPUShareBind(ctx context.Context, clientset *kubernetes.Clientset, c *cache.SchedulerCache) *Bind {
 | 
			
		||||
	return &Bind{
 | 
			
		||||
		Name: "gpusharingbinding",
 | 
			
		||||
		Func: func(name string, namespace string, podUID types.UID, node string, c *cache.SchedulerCache) error {
 | 
			
		||||
			pod, err := getPod(ctx, name, namespace, podUID, clientset, c)
 | 
			
		||||
			if err != nil {
 | 
			
		||||
				log.V(9).Info("warn: Failed to handle pod %s in ns %s due to error %v", name, namespace, err)
 | 
			
		||||
				return err
 | 
			
		||||
			}
 | 
			
		||||
 | 
			
		||||
			nodeInfo, err := c.GetNodeInfo(node)
 | 
			
		||||
			if err != nil {
 | 
			
		||||
				log.V(9).Info("warn: Failed to handle pod %s in ns %s due to error %v", name, namespace, err)
 | 
			
		||||
				return err
 | 
			
		||||
			}
 | 
			
		||||
			err = nodeInfo.Allocate(clientset, pod)
 | 
			
		||||
			if err != nil {
 | 
			
		||||
				log.V(9).Info("warn: Failed to handle pod %s in ns %s due to error %v", name, namespace, err)
 | 
			
		||||
				return err
 | 
			
		||||
			}
 | 
			
		||||
			return nil
 | 
			
		||||
		},
 | 
			
		||||
		cache: c,
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func getPod(ctx context.Context, name string, namespace string, podUID types.UID, clientset *kubernetes.Clientset, c *cache.SchedulerCache) (pod *v1.Pod, err error) {
 | 
			
		||||
	pod, err = c.GetPod(name, namespace)
 | 
			
		||||
	if errors.IsNotFound(err) {
 | 
			
		||||
		pod, err = clientset.CoreV1().Pods(namespace).Get(ctx, name, metav1.GetOptions{})
 | 
			
		||||
		if err != nil {
 | 
			
		||||
			return nil, err
 | 
			
		||||
		}
 | 
			
		||||
	} else if err != nil {
 | 
			
		||||
		return nil, err
 | 
			
		||||
	}
 | 
			
		||||
	if pod.UID != podUID {
 | 
			
		||||
		pod, err = clientset.CoreV1().Pods(namespace).Get(ctx, name, metav1.GetOptions{})
 | 
			
		||||
		if err != nil {
 | 
			
		||||
			return nil, err
 | 
			
		||||
		}
 | 
			
		||||
		if pod.UID != podUID {
 | 
			
		||||
			return nil, fmt.Errorf("the UID of pod %s in ns %s is %v, which does not match the expected %v",
 | 
			
		||||
				name,
 | 
			
		||||
				namespace,
 | 
			
		||||
				pod.UID,
 | 
			
		||||
				podUID)
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return pod, nil
 | 
			
		||||
}
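A minimal usage sketch (a fragment, not part of this commit), assuming a client-go Clientset, a context, and a populated SchedulerCache already exist in scope; the pod name, namespace, UID, and node name are hypothetical:

	bind := scheduler.NewGPUShareBind(ctx, clientset, schedulerCache)
	result := bind.Handler(schedulerapi.ExtenderBindingArgs{
		PodName:      "demo-pod",        // hypothetical pod name
		PodNamespace: "default",         // hypothetical namespace
		PodUID:       types.UID("1234"), // hypothetical UID
		Node:         "gpu-node-1",      // hypothetical node
	})
	if result.Error != "" {
		// binding failed; the error string is propagated back to the scheduler
		fmt.Println("bind error:", result.Error)
	}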
 | 
			
		||||
@@ -0,0 +1,42 @@
 | 
			
		||||
package scheduler
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
func NewGPUShareInspect(c *cache.SchedulerCache) *Inspect {
 | 
			
		||||
	return &Inspect{
 | 
			
		||||
		Name:  "gpushareinspect",
 | 
			
		||||
		cache: c,
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
type Result struct {
 | 
			
		||||
	Nodes []*Node `json:"nodes"`
 | 
			
		||||
	Error string  `json:"error,omitempty"`
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
type Node struct {
 | 
			
		||||
	Name     string    `json:"name"`
 | 
			
		||||
	TotalGPU uint      `json:"totalGPU"`
 | 
			
		||||
	UsedGPU  uint      `json:"usedGPU"`
 | 
			
		||||
	Devices  []*Device `json:"devs"`
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
type Device struct {
 | 
			
		||||
	ID       int    `json:"id"`
 | 
			
		||||
	TotalGPU uint   `json:"totalGPU"`
 | 
			
		||||
	UsedGPU  uint   `json:"usedGPU"`
 | 
			
		||||
	Pods     []*Pod `json:"pods"`
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
type Pod struct {
 | 
			
		||||
	Name      string `json:"name"`
 | 
			
		||||
	Namespace string `json:"namespace"`
 | 
			
		||||
	UsedGPU   int    `json:"usedGPU"`
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
type Inspect struct {
 | 
			
		||||
	Name  string
 | 
			
		||||
	cache *cache.SchedulerCache
 | 
			
		||||
}
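To make the shape of the inspect response concrete, here is a hand-built Result (as it would appear inside the scheduler package, assuming encoding/json is imported) together with the JSON it marshals to; every value is made up for illustration:

	res := Result{
		Nodes: []*Node{{
			Name:     "gpu-node-1", // hypothetical node
			TotalGPU: 16276,
			UsedGPU:  4069,
			Devices: []*Device{{
				ID:       0,
				TotalGPU: 16276,
				UsedGPU:  4069,
				Pods:     []*Pod{{Name: "demo-pod", Namespace: "default", UsedGPU: 4069}},
			}},
		}},
	}
	out, _ := json.Marshal(res)
	// out is:
	// {"nodes":[{"name":"gpu-node-1","totalGPU":16276,"usedGPU":4069,
	//   "devs":[{"id":0,"totalGPU":16276,"usedGPU":4069,
	//     "pods":[{"name":"demo-pod","namespace":"default","usedGPU":4069}]}]}]}
	_ = out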
 | 
			
		||||
@@ -0,0 +1,10 @@
 | 
			
		||||
package scheduler
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
 | 
			
		||||
	"k8s.io/client-go/kubernetes"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
func NewGPUsharePredicate(clientset *kubernetes.Clientset, c *cache.SchedulerCache) *Predicate {
 | 
			
		||||
	return &Predicate{Name: "gpusharingfilter", cache: c}
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
69	gpushare-scheduler-extender/pkg/scheduler/inspect.go	Normal file
@@ -0,0 +1,69 @@
 | 
			
		||||
package scheduler
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
 | 
			
		||||
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
func (in Inspect) Handler(name string) *Result {
 | 
			
		||||
	nodes := []*Node{}
 | 
			
		||||
	errMsg := ""
 | 
			
		||||
	if len(name) == 0 {
 | 
			
		||||
		nodeInfos := in.cache.GetNodeinfos()
 | 
			
		||||
		for _, info := range nodeInfos {
 | 
			
		||||
			nodes = append(nodes, buildNode(info))
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
	} else {
 | 
			
		||||
		node, err := in.cache.GetNodeInfo(name)
 | 
			
		||||
		if err != nil {
			errMsg = err.Error()
		} else {
			nodes = append(nodes, buildNode(node))
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return &Result{
 | 
			
		||||
		Nodes: nodes,
 | 
			
		||||
		Error: errMsg,
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func buildNode(info *cache.NodeInfo) *Node {
 | 
			
		||||
 | 
			
		||||
	devInfos := info.GetDevs()
 | 
			
		||||
	devs := []*Device{}
 | 
			
		||||
	var usedGPU uint
 | 
			
		||||
 | 
			
		||||
	for i, devInfo := range devInfos {
 | 
			
		||||
		dev := &Device{
 | 
			
		||||
			ID:       i,
 | 
			
		||||
			TotalGPU: devInfo.GetTotalGPUMemory(),
 | 
			
		||||
			UsedGPU:  devInfo.GetUsedGPUMemory(),
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		podInfos := devInfo.GetPods()
 | 
			
		||||
		pods := []*Pod{}
 | 
			
		||||
		for _, podInfo := range podInfos {
 | 
			
		||||
			if utils.AssignedNonTerminatedPod(podInfo) {
 | 
			
		||||
				pod := &Pod{
 | 
			
		||||
					Namespace: podInfo.Namespace,
 | 
			
		||||
					Name:      podInfo.Name,
 | 
			
		||||
					UsedGPU:   utils.GetGPUMemoryFromPodResource(podInfo),
 | 
			
		||||
				}
 | 
			
		||||
				pods = append(pods, pod)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		dev.Pods = pods
 | 
			
		||||
		devs = append(devs, dev)
 | 
			
		||||
		usedGPU += devInfo.GetUsedGPUMemory()
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return &Node{
 | 
			
		||||
		Name:     info.GetName(),
 | 
			
		||||
		TotalGPU: uint(info.GetTotalGPUMemory()),
 | 
			
		||||
		UsedGPU:  usedGPU,
 | 
			
		||||
		Devices:  devs,
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
87	gpushare-scheduler-extender/pkg/scheduler/predicate.go	Normal file
@@ -0,0 +1,87 @@
 | 
			
		||||
package scheduler
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"fmt"
 | 
			
		||||
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
 | 
			
		||||
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
 | 
			
		||||
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
 | 
			
		||||
	"k8s.io/api/core/v1"
 | 
			
		||||
	schedulerapi "k8s.io/kube-scheduler/extender/v1"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
type Predicate struct {
 | 
			
		||||
	Name  string
 | 
			
		||||
	cache *cache.SchedulerCache
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (p Predicate) checkNode(pod *v1.Pod, nodeName string, c *cache.SchedulerCache) (*v1.Node, error) {
 | 
			
		||||
	log.V(10).Info("info: check if pod %s can be scheduled on node %s", pod.Name, nodeName)
 | 
			
		||||
	nodeInfo, err := c.GetNodeInfo(nodeName)
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		return nil, err
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	node := nodeInfo.GetNode()
 | 
			
		||||
	if node == nil {
 | 
			
		||||
		return nil, fmt.Errorf("failed to get node with name %s", nodeName)
 | 
			
		||||
	}
 | 
			
		||||
	if !utils.IsGPUSharingNode(node) {
 | 
			
		||||
		return nil, fmt.Errorf("node %s is not enabled for GPU sharing, skipping", nodeName)
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	allocatable := nodeInfo.Assume(pod)
 | 
			
		||||
	if !allocatable {
 | 
			
		||||
		return nil, fmt.Errorf("insufficient GPU memory on any single device")
 | 
			
		||||
	} else {
 | 
			
		||||
		log.V(10).Info("info: The pod %s in the namespace %s can be scheduled on %s",
 | 
			
		||||
			pod.Name,
 | 
			
		||||
			pod.Namespace,
 | 
			
		||||
			nodeName)
 | 
			
		||||
	}
 | 
			
		||||
	return node, nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (p Predicate) Handler(args *schedulerapi.ExtenderArgs) *schedulerapi.ExtenderFilterResult {
 | 
			
		||||
	if args == nil || args.Pod == nil {
 | 
			
		||||
		return &schedulerapi.ExtenderFilterResult{Error: "args or pod is nil"}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	pod := args.Pod
 | 
			
		||||
	var nodeNames []string
 | 
			
		||||
	if args.NodeNames != nil {
 | 
			
		||||
		nodeNames = *args.NodeNames
 | 
			
		||||
		log.V(3).Info("extender args NodeNames is not nil, result %+v", nodeNames)
 | 
			
		||||
	} else if args.Nodes != nil {
 | 
			
		||||
		for _, n := range args.Nodes.Items {
 | 
			
		||||
			nodeNames = append(nodeNames, n.Name)
 | 
			
		||||
		}
 | 
			
		||||
		log.V(3).Info("extender args Nodes is not nil, names is %+v", nodeNames)
 | 
			
		||||
	} else {
 | 
			
		||||
		return &schedulerapi.ExtenderFilterResult{Error: "cannot get node names"}
 | 
			
		||||
	}
 | 
			
		||||
	canSchedule := make([]string, 0, len(nodeNames))
 | 
			
		||||
	canNotSchedule := make(map[string]string)
 | 
			
		||||
	canScheduleNodes := &v1.NodeList{}
 | 
			
		||||
 | 
			
		||||
	for _, nodeName := range nodeNames {
 | 
			
		||||
		node, err := p.checkNode(pod, nodeName, p.cache)
 | 
			
		||||
		if err != nil {
 | 
			
		||||
			canNotSchedule[nodeName] = err.Error()
 | 
			
		||||
		} else {
 | 
			
		||||
			if node != nil {
 | 
			
		||||
				canSchedule = append(canSchedule, nodeName)
 | 
			
		||||
				canScheduleNodes.Items = append(canScheduleNodes.Items, *node)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	result := schedulerapi.ExtenderFilterResult{
 | 
			
		||||
		NodeNames:   &canSchedule,
 | 
			
		||||
		Nodes:       canScheduleNodes,
 | 
			
		||||
		FailedNodes: canNotSchedule,
 | 
			
		||||
		Error:       "",
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	log.V(100).Info("predicate result for %s, is %+v", pod.Name, result)
 | 
			
		||||
	return &result
 | 
			
		||||
}
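As a rough sketch of how the filter endpoint is exercised (a fragment, assuming a Predicate built with NewGPUsharePredicate, a synced cache, and a pod that requests the rainbond.com/gpu-mem resource), a caller passes the candidate node names and gets back the schedulable/failed split:

	nodeNames := []string{"gpu-node-1", "cpu-node-1"} // hypothetical candidates
	args := &schedulerapi.ExtenderArgs{
		Pod:       pod, // the pod being scheduled, assumed to be in scope
		NodeNames: &nodeNames,
	}
	result := predicate.Handler(args)
	// result.NodeNames holds nodes where the pod fits on a single GPU device;
	// result.FailedNodes maps the remaining nodes to the reason they were rejected.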
 | 
			
		||||
							
								
								
									
13	gpushare-scheduler-extender/pkg/utils/const.go	Normal file
@@ -0,0 +1,13 @@
 | 
			
		||||
package utils
 | 
			
		||||
 | 
			
		||||
const (
 | 
			
		||||
	ResourceName = "rainbond.com/gpu-mem"
 | 
			
		||||
	CountName    = "rainbond.com/gpu-count"
 | 
			
		||||
 | 
			
		||||
	EnvNVGPU              = "NVIDIA_VISIBLE_DEVICES"
 | 
			
		||||
	EnvResourceIndex      = "ALIYUN_COM_GPU_MEM_IDX"
 | 
			
		||||
	EnvResourceByPod      = "ALIYUN_COM_GPU_MEM_POD"
 | 
			
		||||
	EnvResourceByDev      = "ALIYUN_COM_GPU_MEM_DEV"
 | 
			
		||||
	EnvAssignedFlag       = "ALIYUN_COM_GPU_MEM_ASSIGNED"
 | 
			
		||||
	EnvResourceAssumeTime = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
 | 
			
		||||
)
 | 
			
		||||
							
								
								
									
30	gpushare-scheduler-extender/pkg/utils/node.go	Normal file
@@ -0,0 +1,30 @@
 | 
			
		||||
package utils
 | 
			
		||||
 | 
			
		||||
import "k8s.io/api/core/v1"
 | 
			
		||||
 | 
			
		||||
// IsGPUSharingNode reports whether the node is enabled for GPU sharing
 | 
			
		||||
func IsGPUSharingNode(node *v1.Node) bool {
 | 
			
		||||
	return GetTotalGPUMemory(node) > 0
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// GetTotalGPUMemory returns the total GPU memory of the node
 | 
			
		||||
func GetTotalGPUMemory(node *v1.Node) int {
 | 
			
		||||
	val, ok := node.Status.Capacity[ResourceName]
 | 
			
		||||
 | 
			
		||||
	if !ok {
 | 
			
		||||
		return 0
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return int(val.Value())
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// GetGPUCountInNode returns the number of GPUs on the node
 | 
			
		||||
func GetGPUCountInNode(node *v1.Node) int {
 | 
			
		||||
	val, ok := node.Status.Capacity[CountName]
 | 
			
		||||
 | 
			
		||||
	if !ok {
 | 
			
		||||
		return 0
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return int(val.Value())
 | 
			
		||||
}
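For illustration (inside the utils package, assuming k8s.io/apimachinery/pkg/api/resource is imported), a node advertising these extended resources would be treated as a GPU-sharing node by the helpers above; the quantities are arbitrary:

	node := &v1.Node{
		Status: v1.NodeStatus{
			Capacity: v1.ResourceList{
				ResourceName: resource.MustParse("16276"), // total shareable GPU memory (arbitrary)
				CountName:    resource.MustParse("1"),     // number of physical GPUs (arbitrary)
			},
		},
	}
	_ = IsGPUSharingNode(node)  // true, because total GPU memory > 0
	_ = GetTotalGPUMemory(node) // 16276
	_ = GetGPUCountInNode(node) // 1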
 | 
			
		||||
							
								
								
									
219	gpushare-scheduler-extender/pkg/utils/pod.go	Normal file
@@ -0,0 +1,219 @@
 | 
			
		||||
package utils
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"encoding/json"
 | 
			
		||||
	"fmt"
 | 
			
		||||
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
 | 
			
		||||
	v1 "k8s.io/api/core/v1"
 | 
			
		||||
	"strconv"
 | 
			
		||||
	"time"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
// AssignedNonTerminatedPod selects pods that are assigned and non-terminal (scheduled and running).
 | 
			
		||||
func AssignedNonTerminatedPod(pod *v1.Pod) bool {
 | 
			
		||||
	if pod.DeletionTimestamp != nil {
 | 
			
		||||
		return false
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if len(pod.Spec.NodeName) == 0 {
 | 
			
		||||
		return false
 | 
			
		||||
	}
 | 
			
		||||
	if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
 | 
			
		||||
		return false
 | 
			
		||||
	}
 | 
			
		||||
	return true
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// IsCompletePod determines if the pod is complete
 | 
			
		||||
func IsCompletePod(pod *v1.Pod) bool {
 | 
			
		||||
	if pod.DeletionTimestamp != nil {
 | 
			
		||||
		return true
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
 | 
			
		||||
		return true
 | 
			
		||||
	}
 | 
			
		||||
	return false
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// IsGPUsharingPod determines if it's the pod for GPU sharing
 | 
			
		||||
func IsGPUsharingPod(pod *v1.Pod) bool {
 | 
			
		||||
	return GetGPUMemoryFromPodResource(pod) > 0
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// GetGPUIDFromAnnotation gets GPU ID from Annotation
 | 
			
		||||
func GetGPUIDFromAnnotation(pod *v1.Pod) int {
 | 
			
		||||
	id := -1
 | 
			
		||||
	if len(pod.ObjectMeta.Annotations) > 0 {
 | 
			
		||||
		value, found := pod.ObjectMeta.Annotations[EnvResourceIndex]
 | 
			
		||||
		if found {
 | 
			
		||||
			var err error
 | 
			
		||||
			id, err = strconv.Atoi(value)
 | 
			
		||||
			if err != nil {
 | 
			
		||||
				log.V(9).Info("warn: Failed due to %v for pod %s in ns %s", err, pod.Name, pod.Namespace)
 | 
			
		||||
				id = -1
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return id
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// GetGPUIDFromEnv gets GPU ID from Env
 | 
			
		||||
func GetGPUIDFromEnv(pod *v1.Pod) int {
 | 
			
		||||
	id := -1
 | 
			
		||||
	for _, container := range pod.Spec.Containers {
 | 
			
		||||
		id = getGPUIDFromContainer(container)
 | 
			
		||||
		if id >= 0 {
 | 
			
		||||
			return id
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return id
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func getGPUIDFromContainer(container v1.Container) (devIdx int) {
 | 
			
		||||
	devIdx = -1
 | 
			
		||||
	var err error
 | 
			
		||||
loop:
 | 
			
		||||
	for _, env := range container.Env {
 | 
			
		||||
		if env.Name == EnvResourceIndex {
 | 
			
		||||
			devIdx, err = strconv.Atoi(env.Value)
 | 
			
		||||
			if err != nil {
 | 
			
		||||
				log.V(9).Info("warn: Failed due to %v for %s", err, container.Name)
 | 
			
		||||
				devIdx = -1
 | 
			
		||||
			}
 | 
			
		||||
			break loop
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return devIdx
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// GetGPUMemoryFromPodAnnotation gets the GPU memory requested by the pod from its ALIYUN_COM_GPU_MEM_POD annotation
 | 
			
		||||
func GetGPUMemoryFromPodAnnotation(pod *v1.Pod) (gpuMemory uint) {
 | 
			
		||||
	if len(pod.ObjectMeta.Annotations) > 0 {
 | 
			
		||||
		value, found := pod.ObjectMeta.Annotations[EnvResourceByPod]
 | 
			
		||||
		if found {
 | 
			
		||||
			s, _ := strconv.Atoi(value)
 | 
			
		||||
			if s < 0 {
 | 
			
		||||
				s = 0
 | 
			
		||||
			}
 | 
			
		||||
 | 
			
		||||
			gpuMemory += uint(s)
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	log.V(100).Info("debug: pod %s in ns %s with status %v has GPU Mem %d",
 | 
			
		||||
		pod.Name,
 | 
			
		||||
		pod.Namespace,
 | 
			
		||||
		pod.Status.Phase,
 | 
			
		||||
		gpuMemory)
 | 
			
		||||
	return gpuMemory
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// GetGPUMemoryFromPodEnv sums the GPU memory declared in the ALIYUN_COM_GPU_MEM_POD env var of each container
 | 
			
		||||
func GetGPUMemoryFromPodEnv(pod *v1.Pod) (gpuMemory uint) {
 | 
			
		||||
	for _, container := range pod.Spec.Containers {
 | 
			
		||||
		gpuMemory += getGPUMemoryFromContainerEnv(container)
 | 
			
		||||
	}
 | 
			
		||||
	log.V(100).Info("debug: pod %s in ns %s with status %v has GPU Mem %d",
 | 
			
		||||
		pod.Name,
 | 
			
		||||
		pod.Namespace,
 | 
			
		||||
		pod.Status.Phase,
 | 
			
		||||
		gpuMemory)
 | 
			
		||||
	return gpuMemory
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func getGPUMemoryFromContainerEnv(container v1.Container) (gpuMemory uint) {
 | 
			
		||||
	gpuMemory = 0
 | 
			
		||||
loop:
 | 
			
		||||
	for _, env := range container.Env {
 | 
			
		||||
		if env.Name == EnvResourceByPod {
 | 
			
		||||
			s, _ := strconv.Atoi(env.Value)
 | 
			
		||||
			if s < 0 {
 | 
			
		||||
				s = 0
 | 
			
		||||
			}
 | 
			
		||||
			gpuMemory = uint(s)
 | 
			
		||||
			break loop
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return gpuMemory
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// GetGPUMemoryFromPodResource gets GPU Memory of the Pod
 | 
			
		||||
func GetGPUMemoryFromPodResource(pod *v1.Pod) int {
 | 
			
		||||
	var total int
 | 
			
		||||
	containers := pod.Spec.Containers
 | 
			
		||||
	for _, container := range containers {
 | 
			
		||||
		if val, ok := container.Resources.Limits[ResourceName]; ok {
 | 
			
		||||
			total += int(val.Value())
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
	return total
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// GetGPUMemoryFromContainerResource gets the GPU memory of the container
 | 
			
		||||
func GetGPUMemoryFromContainerResource(container v1.Container) int {
 | 
			
		||||
	var total int
 | 
			
		||||
	if val, ok := container.Resources.Limits[ResourceName]; ok {
 | 
			
		||||
		total += int(val.Value())
 | 
			
		||||
	}
 | 
			
		||||
	return total
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// GetUpdatedPodEnvSpec returns a copy of the pod with GPU assignment env vars injected for devId
 | 
			
		||||
func GetUpdatedPodEnvSpec(oldPod *v1.Pod, devId int, totalGPUMemByDev int) (newPod *v1.Pod) {
 | 
			
		||||
	newPod = oldPod.DeepCopy()
 | 
			
		||||
	for i, c := range newPod.Spec.Containers {
 | 
			
		||||
		gpuMem := GetGPUMemoryFromContainerResource(c)
 | 
			
		||||
 | 
			
		||||
		if gpuMem > 0 {
 | 
			
		||||
			envs := []v1.EnvVar{
 | 
			
		||||
				// v1.EnvVar{Name: EnvNVGPU, Value: fmt.Sprintf("%d", devId)},
 | 
			
		||||
				v1.EnvVar{Name: EnvResourceIndex, Value: fmt.Sprintf("%d", devId)},
 | 
			
		||||
				v1.EnvVar{Name: EnvResourceByPod, Value: fmt.Sprintf("%d", gpuMem)},
 | 
			
		||||
				v1.EnvVar{Name: EnvResourceByDev, Value: fmt.Sprintf("%d", totalGPUMemByDev)},
 | 
			
		||||
				v1.EnvVar{Name: EnvAssignedFlag, Value: "false"},
 | 
			
		||||
			}
 | 
			
		||||
 | 
			
		||||
			for _, env := range envs {
 | 
			
		||||
				newPod.Spec.Containers[i].Env = append(newPod.Spec.Containers[i].Env,
 | 
			
		||||
					env)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return newPod
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// GetUpdatedPodAnnotationSpec returns a copy of the pod with GPU assignment annotations set for devId
 | 
			
		||||
func GetUpdatedPodAnnotationSpec(oldPod *v1.Pod, devId int, totalGPUMemByDev int) (newPod *v1.Pod) {
 | 
			
		||||
	newPod = oldPod.DeepCopy()
 | 
			
		||||
	if len(newPod.ObjectMeta.Annotations) == 0 {
 | 
			
		||||
		newPod.ObjectMeta.Annotations = map[string]string{}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	now := time.Now()
 | 
			
		||||
	newPod.ObjectMeta.Annotations[EnvResourceIndex] = fmt.Sprintf("%d", devId)
 | 
			
		||||
	newPod.ObjectMeta.Annotations[EnvResourceByDev] = fmt.Sprintf("%d", totalGPUMemByDev)
 | 
			
		||||
	newPod.ObjectMeta.Annotations[EnvResourceByPod] = fmt.Sprintf("%d", GetGPUMemoryFromPodResource(newPod))
 | 
			
		||||
	newPod.ObjectMeta.Annotations[EnvAssignedFlag] = "false"
 | 
			
		||||
	newPod.ObjectMeta.Annotations[EnvResourceAssumeTime] = fmt.Sprintf("%d", now.UnixNano())
 | 
			
		||||
 | 
			
		||||
	return newPod
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func PatchPodAnnotationSpec(oldPod *v1.Pod, devId int, totalGPUMemByDev int) ([]byte, error) {
 | 
			
		||||
	now := time.Now()
 | 
			
		||||
	patchAnnotations := map[string]interface{}{
 | 
			
		||||
		"metadata": map[string]map[string]string{"annotations": {
 | 
			
		||||
			EnvResourceIndex:      fmt.Sprintf("%d", devId),
 | 
			
		||||
			EnvResourceByDev:      fmt.Sprintf("%d", totalGPUMemByDev),
 | 
			
		||||
			EnvResourceByPod:      fmt.Sprintf("%d", GetGPUMemoryFromPodResource(oldPod)),
 | 
			
		||||
			EnvAssignedFlag:       "false",
 | 
			
		||||
			EnvResourceAssumeTime: fmt.Sprintf("%d", now.UnixNano()),
 | 
			
		||||
		}}}
 | 
			
		||||
	return json.Marshal(patchAnnotations)
 | 
			
		||||
}
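A hedged sketch of how the patch produced above might be applied with client-go; this fragment is not part of the commit, and the context, clientset, pod, devId, totalGPUMemByDev variables as well as the strategic-merge patch type are assumptions:

	patchBytes, err := PatchPodAnnotationSpec(pod, devId, totalGPUMemByDev)
	if err != nil {
		return err
	}
	_, err = clientset.CoreV1().Pods(pod.Namespace).Patch(
		ctx,
		pod.Name,
		types.StrategicMergePatchType, // assumed patch type
		patchBytes,
		metav1.PatchOptions{},
	)
	return err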
 | 
			
		||||
							
								
								
									
30	gpushare-scheduler-extender/pkg/utils/signals/signal.go	Normal file
@@ -0,0 +1,30 @@
 | 
			
		||||
package signals
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"os"
 | 
			
		||||
	"os/signal"
 | 
			
		||||
	"syscall"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
var onlyOneSignalHandler = make(chan struct{})
 | 
			
		||||
 | 
			
		||||
var shutdownSignals = []os.Signal{os.Interrupt, syscall.SIGTERM}
 | 
			
		||||
 | 
			
		||||
// SetupSignalHandler registers for SIGTERM and SIGINT. A stop channel is returned
 | 
			
		||||
// which is closed on one of these signals. If a second signal is caught, the program
 | 
			
		||||
// is terminated with exit code 1.
 | 
			
		||||
func SetupSignalHandler() (stopCh <-chan struct{}) {
 | 
			
		||||
	close(onlyOneSignalHandler) // panics when called twice
 | 
			
		||||
 | 
			
		||||
	stop := make(chan struct{})
 | 
			
		||||
	c := make(chan os.Signal, 2)
 | 
			
		||||
	signal.Notify(c, shutdownSignals...)
 | 
			
		||||
	go func() {
 | 
			
		||||
		<-c
 | 
			
		||||
		close(stop)
 | 
			
		||||
		<-c
 | 
			
		||||
		os.Exit(1) // second signal. Exit directly.
 | 
			
		||||
	}()
 | 
			
		||||
 | 
			
		||||
	return stop
 | 
			
		||||
}
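Typical use of the handler, as a sketch and not part of this commit: obtain the stop channel once in main and pass it to long-running components such as shared informer factories (clientset is assumed to exist; informers refers to k8s.io/client-go/informers):

	stopCh := signals.SetupSignalHandler()

	informerFactory := informers.NewSharedInformerFactory(clientset, 0) // clientset assumed
	informerFactory.Start(stopCh)
	informerFactory.WaitForCacheSync(stopCh)

	<-stopCh // block until SIGINT/SIGTERM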
 | 
			
		||||