synchronization

commit 33f9b3ce46 · 2025-08-25 16:04:00 +08:00
1951 changed files with 854396 additions and 0 deletions

View File

@@ -0,0 +1,177 @@
package cache
import (
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
"sync"
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
corelisters "k8s.io/client-go/listers/core/v1"
)
type SchedulerCache struct {
	// a map from node name to NodeInfo.
nodes map[string]*NodeInfo
// nodeLister can list/get nodes from the shared informer's store.
nodeLister corelisters.NodeLister
	// podLister can list/get pods from the shared informer's store.
podLister corelisters.PodLister
	// knownPods records known pods: a pod is added once its ALIYUN_GPU_ID annotation is set, and removed when it completes and is deleted
knownPods map[types.UID]*v1.Pod
nLock *sync.RWMutex
}
func NewSchedulerCache(nLister corelisters.NodeLister, pLister corelisters.PodLister) *SchedulerCache {
return &SchedulerCache{
nodes: make(map[string]*NodeInfo),
nodeLister: nLister,
podLister: pLister,
knownPods: make(map[types.UID]*v1.Pod),
nLock: new(sync.RWMutex),
}
}
func (cache *SchedulerCache) GetNodeinfos() []*NodeInfo {
nodes := []*NodeInfo{}
for _, n := range cache.nodes {
nodes = append(nodes, n)
}
return nodes
}
// BuildCache builds the cache when the extender initializes
func (cache *SchedulerCache) BuildCache() error {
log.V(5).Info("debug: begin to build scheduler cache")
pods, err := cache.podLister.List(labels.Everything())
	if err != nil {
		return err
	}
	for _, pod := range pods {
		if utils.GetGPUMemoryFromPodAnnotation(pod) <= uint(0) {
			continue
		}
		if len(pod.Spec.NodeName) == 0 {
			continue
		}
		if err := cache.AddOrUpdatePod(pod); err != nil {
			return err
		}
	}
return nil
}
func (cache *SchedulerCache) GetPod(name, namespace string) (*v1.Pod, error) {
return cache.podLister.Pods(namespace).Get(name)
}
// KnownPod reports whether the pod UID is already tracked
func (cache *SchedulerCache) KnownPod(podUID types.UID) bool {
cache.nLock.RLock()
defer cache.nLock.RUnlock()
_, found := cache.knownPods[podUID]
return found
}
func (cache *SchedulerCache) AddOrUpdatePod(pod *v1.Pod) error {
log.V(100).Info("debug: Add or update pod info: %v", pod)
log.V(100).Info("debug: Node %v", cache.nodes)
if len(pod.Spec.NodeName) == 0 {
log.V(100).Info("debug: pod %s in ns %s is not assigned to any node, skip", pod.Name, pod.Namespace)
return nil
}
n, err := cache.GetNodeInfo(pod.Spec.NodeName)
if err != nil {
return err
}
podCopy := pod.DeepCopy()
if n.addOrUpdatePod(podCopy) {
		// record it as a known pod
cache.rememberPod(pod.UID, podCopy)
} else {
log.V(100).Info("debug: pod %s in ns %s's gpu id is %d, it's illegal, skip",
pod.Name,
pod.Namespace,
utils.GetGPUIDFromAnnotation(pod))
}
return nil
}
// RemovePod removes the pod from the cache; the lock is taken inside NodeInfo
func (cache *SchedulerCache) RemovePod(pod *v1.Pod) {
log.V(100).Info("debug: Remove pod info: %v", pod)
log.V(100).Info("debug: Node %v", cache.nodes)
n, err := cache.GetNodeInfo(pod.Spec.NodeName)
if err == nil {
n.removePod(pod)
} else {
log.V(10).Info("debug: Failed to get node %s due to %v", pod.Spec.NodeName, err)
}
cache.forgetPod(pod.UID)
}
// Get or build nodeInfo if it doesn't exist
func (cache *SchedulerCache) GetNodeInfo(name string) (*NodeInfo, error) {
node, err := cache.nodeLister.Get(name)
if err != nil {
return nil, err
}
cache.nLock.Lock()
defer cache.nLock.Unlock()
n, ok := cache.nodes[name]
if !ok {
n = NewNodeInfo(node)
cache.nodes[name] = n
} else {
		// if the existing node turns from non-gpushare to gpushare
// if (utils.GetTotalGPUMemory(n.node) <= 0 && utils.GetTotalGPUMemory(node) > 0) ||
// (utils.GetGPUCountInNode(n.node) <= 0 && utils.GetGPUCountInNode(node) > 0) ||
		// // if the existing node turns from gpushare to non-gpushare
// (utils.GetTotalGPUMemory(n.node) > 0 && utils.GetTotalGPUMemory(node) <= 0) ||
// (utils.GetGPUCountInNode(n.node) > 0 && utils.GetGPUCountInNode(node) <= 0) {
if len(cache.nodes[name].devs) == 0 ||
utils.GetTotalGPUMemory(n.node) <= 0 ||
utils.GetGPUCountInNode(n.node) <= 0 {
log.V(10).Info("info: GetNodeInfo() need update node %s",
name)
// fix the scenario that the number of devices changes from 0 to an positive number
cache.nodes[name].Reset(node)
log.V(10).Info("info: node: %s, labels from cache after been updated: %v", n.node.Name, n.node.Labels)
} else {
log.V(10).Info("info: GetNodeInfo() uses the existing nodeInfo for %s", name)
}
log.V(100).Info("debug: node %s with devices %v", name, n.devs)
}
return n, nil
}
func (cache *SchedulerCache) forgetPod(uid types.UID) {
cache.nLock.Lock()
defer cache.nLock.Unlock()
delete(cache.knownPods, uid)
}
func (cache *SchedulerCache) rememberPod(uid types.UID, pod *v1.Pod) {
cache.nLock.Lock()
defer cache.nLock.Unlock()
cache.knownPods[pod.UID] = pod
}
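
For orientation, here is a minimal sketch of how this cache is typically wired up from client-go informers. The in-cluster config, the 30s resync period, and the main-package framing are assumptions for illustration, not part of this file:

package main

import (
	"time"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
	kubeinformers "k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

func main() {
	cfg, err := rest.InClusterConfig()
	if err != nil {
		panic(err)
	}
	clientset := kubernetes.NewForConfigOrDie(cfg)
	factory := kubeinformers.NewSharedInformerFactory(clientset, 30*time.Second)

	nodeLister := factory.Core().V1().Nodes().Lister()
	podLister := factory.Core().V1().Pods().Lister()

	stopCh := make(chan struct{})
	factory.Start(stopCh)
	factory.WaitForCacheSync(stopCh)

	// Replay already-assigned GPU-sharing pods into the in-memory cache.
	c := cache.NewSchedulerCache(nodeLister, podLister)
	if err := c.BuildCache(); err != nil {
		panic(err)
	}
}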

View File

@@ -0,0 +1,33 @@
package cache
import (
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
apierrors "k8s.io/apimachinery/pkg/api/errors"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
corelisters "k8s.io/client-go/listers/core/v1"
clientgocache "k8s.io/client-go/tools/cache"
"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
var (
ConfigMapLister corelisters.ConfigMapLister
ConfigMapInformerSynced clientgocache.InformerSynced
)
func getConfigMap(name string) *v1.ConfigMap {
configMap, err := ConfigMapLister.ConfigMaps(metav1.NamespaceSystem).Get(name)
// If we can't get the configmap just return nil. The resync will eventually
// sync things up.
if err != nil {
if !apierrors.IsNotFound(err) {
log.V(10).Info("warn: find configmap with error: %v", err)
utilruntime.HandleError(err)
}
return nil
}
return configMap
}

View File

@@ -0,0 +1,80 @@
package cache
import (
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
"sync"
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
)
type DeviceInfo struct {
idx int
podMap map[types.UID]*v1.Pod
// usedGPUMem uint
totalGPUMem uint
rwmu *sync.RWMutex
}
func (d *DeviceInfo) GetPods() []*v1.Pod {
pods := []*v1.Pod{}
for _, pod := range d.podMap {
pods = append(pods, pod)
}
return pods
}
func newDeviceInfo(index int, totalGPUMem uint) *DeviceInfo {
return &DeviceInfo{
idx: index,
totalGPUMem: totalGPUMem,
podMap: map[types.UID]*v1.Pod{},
rwmu: new(sync.RWMutex),
}
}
func (d *DeviceInfo) GetTotalGPUMemory() uint {
return d.totalGPUMem
}
func (d *DeviceInfo) GetUsedGPUMemory() (gpuMem uint) {
log.V(100).Info("debug: GetUsedGPUMemory() podMap %v, and its address is %p", d.podMap, d)
d.rwmu.RLock()
defer d.rwmu.RUnlock()
for _, pod := range d.podMap {
if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
log.V(100).Info("debug: skip the pod %s in ns %s due to its status is %s", pod.Name, pod.Namespace, pod.Status.Phase)
continue
}
// gpuMem += utils.GetGPUMemoryFromPodEnv(pod)
gpuMem += utils.GetGPUMemoryFromPodAnnotation(pod)
}
return gpuMem
}
func (d *DeviceInfo) addPod(pod *v1.Pod) {
log.V(100).Info("debug: dev.addPod() Pod %s in ns %s with the GPU ID %d will be added to device map",
pod.Name,
pod.Namespace,
d.idx)
d.rwmu.Lock()
defer d.rwmu.Unlock()
d.podMap[pod.UID] = pod
log.V(100).Info("debug: dev.addPod() after updated is %v, and its address is %p",
d.podMap,
d)
}
func (d *DeviceInfo) removePod(pod *v1.Pod) {
log.V(100).Info("debug: dev.removePod() Pod %s in ns %s with the GPU ID %d will be removed from device map",
pod.Name,
pod.Namespace,
d.idx)
d.rwmu.Lock()
defer d.rwmu.Unlock()
delete(d.podMap, pod.UID)
log.V(100).Info("debug: dev.removePod() after updated is %v, and its address is %p",
d.podMap,
d)
}

View File

@@ -0,0 +1,362 @@
package cache
import (
"context"
"fmt"
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
"strconv"
"strings"
"sync"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
)
const (
OptimisticLockErrorMsg = "the object has been modified; please apply your changes to the latest version and try again"
)
// NodeInfo is node level aggregated information.
type NodeInfo struct {
ctx context.Context
name string
node *v1.Node
devs map[int]*DeviceInfo
gpuCount int
gpuTotalMemory int
rwmu *sync.RWMutex
}
// NewNodeInfo creates node-level aggregated information
func NewNodeInfo(node *v1.Node) *NodeInfo {
log.V(10).Info("debug: NewNodeInfo() creates nodeInfo for %s", node.Name)
devMap := map[int]*DeviceInfo{}
for i := 0; i < utils.GetGPUCountInNode(node); i++ {
devMap[i] = newDeviceInfo(i, uint(utils.GetTotalGPUMemory(node)/utils.GetGPUCountInNode(node)))
}
if len(devMap) == 0 {
log.V(3).Info("warn: node %s with nodeinfo %v has no devices", node.Name, node)
}
return &NodeInfo{
ctx: context.Background(),
name: node.Name,
node: node,
devs: devMap,
gpuCount: utils.GetGPUCountInNode(node),
gpuTotalMemory: utils.GetTotalGPUMemory(node),
rwmu: new(sync.RWMutex),
}
}
// Only update the devices when the length of devs is 0
func (n *NodeInfo) Reset(node *v1.Node) {
n.gpuCount = utils.GetGPUCountInNode(node)
n.gpuTotalMemory = utils.GetTotalGPUMemory(node)
n.node = node
if n.gpuCount == 0 {
log.V(3).Info("warn: Reset for node %s but the gpu count is 0", node.Name)
}
if n.gpuTotalMemory == 0 {
log.V(3).Info("warn: Reset for node %s but the gpu total memory is 0", node.Name)
}
if len(n.devs) == 0 && n.gpuCount > 0 {
devMap := map[int]*DeviceInfo{}
for i := 0; i < utils.GetGPUCountInNode(node); i++ {
devMap[i] = newDeviceInfo(i, uint(n.gpuTotalMemory/n.gpuCount))
}
n.devs = devMap
}
log.V(3).Info("info: Reset() update nodeInfo for %s with devs %v", node.Name, n.devs)
}
func (n *NodeInfo) GetName() string {
return n.name
}
func (n *NodeInfo) GetDevs() []*DeviceInfo {
devs := make([]*DeviceInfo, n.gpuCount)
for i, dev := range n.devs {
devs[i] = dev
}
return devs
}
func (n *NodeInfo) GetNode() *v1.Node {
return n.node
}
func (n *NodeInfo) GetTotalGPUMemory() int {
return n.gpuTotalMemory
}
func (n *NodeInfo) GetGPUCount() int {
return n.gpuCount
}
func (n *NodeInfo) removePod(pod *v1.Pod) {
n.rwmu.Lock()
defer n.rwmu.Unlock()
id := utils.GetGPUIDFromAnnotation(pod)
if id >= 0 {
dev, found := n.devs[id]
if !found {
log.V(3).Info("warn: Pod %s in ns %s failed to find the GPU ID %d in node %s", pod.Name, pod.Namespace, id, n.name)
} else {
dev.removePod(pod)
}
} else {
log.V(3).Info("warn: Pod %s in ns %s is not set the GPU ID %d in node %s", pod.Name, pod.Namespace, id, n.name)
}
}
// Add the Pod which has the GPU id to the node
func (n *NodeInfo) addOrUpdatePod(pod *v1.Pod) (added bool) {
n.rwmu.Lock()
defer n.rwmu.Unlock()
id := utils.GetGPUIDFromAnnotation(pod)
log.V(3).Info("debug: addOrUpdatePod() Pod %s in ns %s with the GPU ID %d should be added to device map",
pod.Name,
pod.Namespace,
id)
if id >= 0 {
dev, found := n.devs[id]
if !found {
log.V(3).Info("warn: Pod %s in ns %s failed to find the GPU ID %d in node %s", pod.Name, pod.Namespace, id, n.name)
} else {
dev.addPod(pod)
added = true
}
} else {
log.V(3).Info("warn: Pod %s in ns %s is not set the GPU ID %d in node %s", pod.Name, pod.Namespace, id, n.name)
}
return added
}
// check if the pod can be allocated on the node
func (n *NodeInfo) Assume(pod *v1.Pod) (allocatable bool) {
allocatable = false
n.rwmu.RLock()
defer n.rwmu.RUnlock()
availableGPUs := n.getAvailableGPUs()
reqGPU := uint(utils.GetGPUMemoryFromPodResource(pod))
log.V(10).Info("debug: AvailableGPUs: %v in node %s", availableGPUs, n.name)
if len(availableGPUs) > 0 {
for devID := 0; devID < len(n.devs); devID++ {
availableGPU, ok := availableGPUs[devID]
if ok {
if availableGPU >= reqGPU {
allocatable = true
break
}
}
}
}
return allocatable
}
func (n *NodeInfo) Allocate(clientset *kubernetes.Clientset, pod *v1.Pod) (err error) {
var newPod *v1.Pod
n.rwmu.Lock()
defer n.rwmu.Unlock()
log.V(3).Info("info: Allocate() ----Begin to allocate GPU for gpu mem for pod %s in ns %s----", pod.Name, pod.Namespace)
// 1. Update the pod spec
devId, found := n.allocateGPUID(pod)
if found {
log.V(3).Info("info: Allocate() 1. Allocate GPU ID %d to pod %s in ns %s.----", devId, pod.Name, pod.Namespace)
// newPod := utils.GetUpdatedPodEnvSpec(pod, devId, nodeInfo.GetTotalGPUMemory()/nodeInfo.GetGPUCount())
//newPod = utils.GetUpdatedPodAnnotationSpec(pod, devId, n.GetTotalGPUMemory()/n.GetGPUCount())
patchedAnnotationBytes, err := utils.PatchPodAnnotationSpec(pod, devId, n.GetTotalGPUMemory()/n.GetGPUCount())
if err != nil {
return fmt.Errorf("failed to generate patched annotations,reason: %v", err)
}
newPod, err = clientset.CoreV1().Pods(pod.Namespace).Patch(n.ctx, pod.Name, types.StrategicMergePatchType, patchedAnnotationBytes, metav1.PatchOptions{})
//_, err = clientset.CoreV1().Pods(newPod.Namespace).Update(newPod)
if err != nil {
// the object has been modified; please apply your changes to the latest version and try again
if err.Error() == OptimisticLockErrorMsg {
// retry
pod, err = clientset.CoreV1().Pods(pod.Namespace).Get(n.ctx, pod.Name, metav1.GetOptions{})
if err != nil {
return err
}
// newPod = utils.GetUpdatedPodEnvSpec(pod, devId, nodeInfo.GetTotalGPUMemory()/nodeInfo.GetGPUCount())
//newPod = utils.GetUpdatedPodAnnotationSpec(pod, devId, n.GetTotalGPUMemory()/n.GetGPUCount())
//_, err = clientset.CoreV1().Pods(newPod.Namespace).Update(newPod)
newPod, err = clientset.CoreV1().Pods(pod.Namespace).Patch(n.ctx, pod.Name, types.StrategicMergePatchType, patchedAnnotationBytes, metav1.PatchOptions{})
if err != nil {
return err
}
} else {
log.V(3).Info("failed to patch pod %v", pod)
return err
}
}
} else {
err = fmt.Errorf("The node %s can't place the pod %s in ns %s,and the pod spec is %v", pod.Spec.NodeName, pod.Name, pod.Namespace, pod)
}
// 2. Bind the pod to the node
if err == nil {
binding := &v1.Binding{
ObjectMeta: metav1.ObjectMeta{Name: pod.Name, UID: pod.UID},
Target: v1.ObjectReference{Kind: "Node", Name: n.name},
}
log.V(3).Info("info: Allocate() 2. Try to bind pod %s in %s namespace to node %s with %v",
pod.Name,
pod.Namespace,
pod.Spec.NodeName,
binding)
err = clientset.CoreV1().Pods(pod.Namespace).Bind(n.ctx, binding, metav1.CreateOptions{})
if err != nil {
log.V(3).Info("warn: Failed to bind the pod %s in ns %s due to %v", pod.Name, pod.Namespace, err)
return err
}
}
	// 3. update the device info if the pod is updated successfully
if err == nil {
log.V(3).Info("info: Allocate() 3. Try to add pod %s in ns %s to dev %d",
pod.Name,
pod.Namespace,
devId)
dev, found := n.devs[devId]
if !found {
log.V(3).Info("warn: Pod %s in ns %s failed to find the GPU ID %d in node %s", pod.Name, pod.Namespace, devId, n.name)
} else {
dev.addPod(newPod)
}
}
log.V(3).Info("info: Allocate() ----End to allocate GPU for gpu mem for pod %s in ns %s----", pod.Name, pod.Namespace)
return err
}
// allocate the GPU ID to the pod
func (n *NodeInfo) allocateGPUID(pod *v1.Pod) (candidateDevID int, found bool) {
reqGPU := uint(0)
found = false
candidateDevID = -1
candidateGPUMemory := uint(0)
availableGPUs := n.getAvailableGPUs()
reqGPU = uint(utils.GetGPUMemoryFromPodResource(pod))
if reqGPU > uint(0) {
log.V(3).Info("info: reqGPU for pod %s in ns %s: %d", pod.Name, pod.Namespace, reqGPU)
log.V(3).Info("info: AvailableGPUs: %v in node %s", availableGPUs, n.name)
if len(availableGPUs) > 0 {
for devID := 0; devID < len(n.devs); devID++ {
availableGPU, ok := availableGPUs[devID]
if ok {
if availableGPU >= reqGPU {
if candidateDevID == -1 || candidateGPUMemory > availableGPU {
candidateDevID = devID
candidateGPUMemory = availableGPU
}
found = true
}
}
}
}
if found {
log.V(3).Info("info: Find candidate dev id %d for pod %s in ns %s successfully.",
candidateDevID,
pod.Name,
pod.Namespace)
} else {
log.V(3).Info("warn: Failed to find available GPUs %d for the pod %s in the namespace %s",
reqGPU,
pod.Name,
pod.Namespace)
}
}
return candidateDevID, found
}
func (n *NodeInfo) getAvailableGPUs() (availableGPUs map[int]uint) {
allGPUs := n.getAllGPUs()
usedGPUs := n.getUsedGPUs()
unhealthyGPUs := n.getUnhealthyGPUs()
availableGPUs = map[int]uint{}
for id, totalGPUMem := range allGPUs {
if usedGPUMem, found := usedGPUs[id]; found {
availableGPUs[id] = totalGPUMem - usedGPUMem
}
}
log.V(3).Info("info: available GPU list %v before removing unhealty GPUs", availableGPUs)
for id, _ := range unhealthyGPUs {
log.V(3).Info("info: delete dev %d from availble GPU list", id)
delete(availableGPUs, id)
}
log.V(3).Info("info: available GPU list %v after removing unhealty GPUs", availableGPUs)
return availableGPUs
}
// device index: gpu memory
func (n *NodeInfo) getUsedGPUs() (usedGPUs map[int]uint) {
usedGPUs = map[int]uint{}
for _, dev := range n.devs {
usedGPUs[dev.idx] = dev.GetUsedGPUMemory()
}
log.V(3).Info("info: getUsedGPUs: %v in node %s, and devs %v", usedGPUs, n.name, n.devs)
return usedGPUs
}
// device index: gpu memory
func (n *NodeInfo) getAllGPUs() (allGPUs map[int]uint) {
allGPUs = map[int]uint{}
for _, dev := range n.devs {
allGPUs[dev.idx] = dev.totalGPUMem
}
log.V(3).Info("info: getAllGPUs: %v in node %s, and dev %v", allGPUs, n.name, n.devs)
return allGPUs
}
// getUnhealthyGPUs get the unhealthy GPUs from configmap
func (n *NodeInfo) getUnhealthyGPUs() (unhealthyGPUs map[int]bool) {
unhealthyGPUs = map[int]bool{}
name := fmt.Sprintf("unhealthy-gpu-%s", n.GetName())
log.V(3).Info("info: try to find unhealthy node %s", name)
cm := getConfigMap(name)
if cm == nil {
return
}
if devicesStr, found := cm.Data["gpus"]; found {
log.V(3).Info("warn: the unhelathy gpus %s", devicesStr)
idsStr := strings.Split(devicesStr, ",")
for _, sid := range idsStr {
			id, err := strconv.Atoi(sid)
			if err != nil {
				log.V(3).Info("warn: failed to parse id %s due to %v", sid, err)
				continue
			}
unhealthyGPUs[id] = true
}
} else {
log.V(3).Info("info: skip, because there are no unhealthy gpus")
}
return
}
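
getUnhealthyGPUs() expects a ConfigMap named unhealthy-gpu-<node> in kube-system with the failed device indices comma-separated under the "gpus" key. A hedged sketch of a writer producing that shape (how such a ConfigMap is really populated, e.g. by a node-side health monitor, is an assumption, not part of this commit):

package gpuhealth

import (
	"context"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// markUnhealthyGPUs writes the ConfigMap that getUnhealthyGPUs() consumes:
// "unhealthy-gpu-<node>" in kube-system, with the failed device indices
// comma-separated under the "gpus" key, e.g. "0,3".
func markUnhealthyGPUs(ctx context.Context, clientset kubernetes.Interface, nodeName, ids string) error {
	cm := &v1.ConfigMap{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "unhealthy-gpu-" + nodeName,
			Namespace: metav1.NamespaceSystem,
		},
		Data: map[string]string{"gpus": ids},
	}
	_, err := clientset.CoreV1().ConfigMaps(metav1.NamespaceSystem).Create(ctx, cm, metav1.CreateOptions{})
	return err
}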

View File

@@ -0,0 +1,346 @@
package gpushare
import (
"fmt"
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
"golang.org/x/time/rate"
"time"
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
kubeinformers "k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
clientgocache "k8s.io/client-go/tools/cache"
"k8s.io/client-go/util/workqueue"
typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/tools/record"
)
var (
KeyFunc = clientgocache.DeletionHandlingMetaNamespaceKeyFunc
)
type Controller struct {
clientset *kubernetes.Clientset
// podLister can list/get pods from the shared informer's store.
podLister corelisters.PodLister
// nodeLister can list/get nodes from the shared informer's store.
nodeLister corelisters.NodeLister
// podQueue is a rate limited work queue. This is used to queue work to be
// processed instead of performing it as soon as a change happens. This
// means we can ensure we only process a fixed amount of resources at a
// time, and makes it easy to ensure we are never processing the same item
// simultaneously in two different workers.
podQueue workqueue.RateLimitingInterface
// recorder is an event recorder for recording Event resources to the
// Kubernetes API.
recorder record.EventRecorder
// podInformerSynced returns true if the pod store has been synced at least once.
podInformerSynced clientgocache.InformerSynced
	// nodeInformerSynced returns true if the node store has been synced at least once.
nodeInformerSynced clientgocache.InformerSynced
schedulerCache *cache.SchedulerCache
	// removePodCache stores pods pending removal from the scheduler cache
removePodCache map[string]*v1.Pod
}
func NewController(clientset *kubernetes.Clientset, kubeInformerFactory kubeinformers.SharedInformerFactory, stopCh <-chan struct{}) (*Controller, error) {
log.V(100).Info("info: Creating event broadcaster")
eventBroadcaster := record.NewBroadcaster()
// eventBroadcaster.StartLogging(log.Infof)
eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: clientset.CoreV1().Events("")})
recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "gpushare-schd-extender"})
rateLimiter := workqueue.NewMaxOfRateLimiter(
workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 1000*time.Second),
&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(100), 500)},
)
c := &Controller{
clientset: clientset,
podQueue: workqueue.NewNamedRateLimitingQueue(rateLimiter, "podQueue"),
recorder: recorder,
removePodCache: map[string]*v1.Pod{},
}
// Create pod informer.
podInformer := kubeInformerFactory.Core().V1().Pods()
podInformer.Informer().AddEventHandler(clientgocache.FilteringResourceEventHandler{
FilterFunc: func(obj interface{}) bool {
switch t := obj.(type) {
case *v1.Pod:
// log.V(100).Info("debug: added pod %s in ns %s", t.Name, t.Namespace)
return utils.IsGPUsharingPod(t)
case clientgocache.DeletedFinalStateUnknown:
if pod, ok := t.Obj.(*v1.Pod); ok {
log.V(100).Info("debug: delete pod %s in ns %s", pod.Name, pod.Namespace)
return utils.IsGPUsharingPod(pod)
}
runtime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod in %T", obj, c))
return false
default:
runtime.HandleError(fmt.Errorf("unable to handle object in %T: %T", c, obj))
return false
}
},
Handler: clientgocache.ResourceEventHandlerFuncs{
AddFunc: c.addPodToCache,
UpdateFunc: c.updatePodInCache,
DeleteFunc: c.deletePodFromCache,
},
})
c.podLister = podInformer.Lister()
c.podInformerSynced = podInformer.Informer().HasSynced
// Create node informer
nodeInformer := kubeInformerFactory.Core().V1().Nodes()
c.nodeLister = nodeInformer.Lister()
c.nodeInformerSynced = nodeInformer.Informer().HasSynced
// Create configMap informer
cmInformer := kubeInformerFactory.Core().V1().ConfigMaps()
cache.ConfigMapLister = cmInformer.Lister()
cache.ConfigMapInformerSynced = cmInformer.Informer().HasSynced
// Start informer goroutines.
go kubeInformerFactory.Start(stopCh)
// Create scheduler Cache
c.schedulerCache = cache.NewSchedulerCache(c.nodeLister, c.podLister)
log.V(100).Info("info: begin to wait for cache")
if ok := clientgocache.WaitForCacheSync(stopCh, c.nodeInformerSynced); !ok {
return nil, fmt.Errorf("failed to wait for node caches to sync")
} else {
log.V(100).Info("info: init the node cache successfully")
}
if ok := clientgocache.WaitForCacheSync(stopCh, c.podInformerSynced); !ok {
return nil, fmt.Errorf("failed to wait for pod caches to sync")
} else {
log.V(100).Info("info: init the pod cache successfully")
}
if ok := clientgocache.WaitForCacheSync(stopCh, cache.ConfigMapInformerSynced); !ok {
return nil, fmt.Errorf("failed to wait for configmap caches to sync")
} else {
log.V(100).Info("info: init the configmap cache successfully")
}
log.V(100).Info("info: end to wait for cache")
return c, nil
}
func (c *Controller) BuildCache() error {
return c.schedulerCache.BuildCache()
}
func (c *Controller) GetSchedulerCache() *cache.SchedulerCache {
return c.schedulerCache
}
// Run starts the worker goroutines and blocks until stopCh is closed
func (c *Controller) Run(threadiness int, stopCh <-chan struct{}) error {
defer runtime.HandleCrash()
defer c.podQueue.ShutDown()
log.V(9).Info("info: Starting GPU Sharing Controller.")
log.V(9).Info("info: Waiting for informer caches to sync")
log.V(9).Info("info: Starting %v workers.", threadiness)
for i := 0; i < threadiness; i++ {
go wait.Until(c.runWorker, time.Second, stopCh)
}
log.V(3).Info("info: Started workers")
<-stopCh
log.V(3).Info("info: Shutting down workers")
return nil
}
// runWorker is a long-running function that will continually call the
// processNextWorkItem function in order to read and process a message on the
// workqueue.
func (c *Controller) runWorker() {
for c.processNextWorkItem() {
}
}
// syncPod syncs the scheduler cache for the pod with the given key: completed
// or deleted pods are removed, others are added or updated. This function is
// not meant to be invoked concurrently with the same key.
func (c *Controller) syncPod(key string) (forget bool, err error) {
ns, name, err := clientgocache.SplitMetaNamespaceKey(key)
log.V(9).Info("debug: begin to sync gpushare pod %s in ns %s", name, ns)
if err != nil {
return false, err
}
pod, err := c.podLister.Pods(ns).Get(name)
switch {
case errors.IsNotFound(err):
log.V(10).Info("debug: pod %s in ns %s has been deleted.", name, ns)
pod, found := c.removePodCache[key]
if found {
c.schedulerCache.RemovePod(pod)
delete(c.removePodCache, key)
}
case err != nil:
log.V(10).Info("warn: unable to retrieve pod %v from the store: %v", key, err)
default:
if utils.IsCompletePod(pod) {
log.V(10).Info("debug: pod %s in ns %s has completed.", name, ns)
c.schedulerCache.RemovePod(pod)
} else {
err := c.schedulerCache.AddOrUpdatePod(pod)
if err != nil {
return false, err
}
}
}
return true, nil
}
// processNextWorkItem will read a single work item off the podQueue and
// attempt to process it.
func (c *Controller) processNextWorkItem() bool {
log.V(100).Info("debug: begin processNextWorkItem()")
key, quit := c.podQueue.Get()
if quit {
return false
}
defer c.podQueue.Done(key)
defer log.V(100).Info("debug: end processNextWorkItem()")
forget, err := c.syncPod(key.(string))
if err == nil {
if forget {
c.podQueue.Forget(key)
}
return true
}
log.V(3).Info("Error syncing pods: %v", err)
runtime.HandleError(fmt.Errorf("Error syncing pod: %v", err))
c.podQueue.AddRateLimited(key)
return true
}
func (c *Controller) addPodToCache(obj interface{}) {
pod, ok := obj.(*v1.Pod)
if !ok {
log.V(3).Info("warn: cannot convert to *v1.Pod: %v", obj)
return
}
// if !assignedNonTerminatedPod(t) {
// log.V(100).Info("debug: skip pod %s due to it's terminated.", pod.Name)
// return
// }
podKey, err := KeyFunc(pod)
if err != nil {
log.V(3).Info("warn: Failed to get the jobkey: %v", err)
return
}
c.podQueue.Add(podKey)
// NOTE: Updating equivalence cache of addPodToCache has been
// handled optimistically in: pkg/scheduler/scheduler.go#assume()
}
func (c *Controller) updatePodInCache(oldObj, newObj interface{}) {
oldPod, ok := oldObj.(*v1.Pod)
if !ok {
log.V(3).Info("warn: cannot convert oldObj to *v1.Pod: %v", oldObj)
return
}
newPod, ok := newObj.(*v1.Pod)
if !ok {
log.V(3).Info("warn: cannot convert newObj to *v1.Pod: %v", newObj)
return
}
needUpdate := false
podUID := oldPod.UID
// 1. Need update when pod is turned to complete or failed
if c.schedulerCache.KnownPod(podUID) && utils.IsCompletePod(newPod) {
needUpdate = true
}
// 2. Need update when it's unknown pod, and GPU annotation has been set
if !c.schedulerCache.KnownPod(podUID) && utils.GetGPUIDFromAnnotation(newPod) >= 0 {
needUpdate = true
}
if needUpdate {
podKey, err := KeyFunc(newPod)
if err != nil {
log.V(3).Info("warn: Failed to get the jobkey: %v", err)
return
}
log.V(3).Info("info: Need to update pod name %s in ns %s and old status is %v, new status is %v; its old annotation %v and new annotation %v",
newPod.Name,
newPod.Namespace,
oldPod.Status.Phase,
newPod.Status.Phase,
oldPod.Annotations,
newPod.Annotations)
c.podQueue.Add(podKey)
} else {
log.V(100).Info("debug: No need to update pod name %s in ns %s and old status is %v, new status is %v; its old annotation %v and new annotation %v",
newPod.Name,
newPod.Namespace,
oldPod.Status.Phase,
newPod.Status.Phase,
oldPod.Annotations,
newPod.Annotations)
}
}
func (c *Controller) deletePodFromCache(obj interface{}) {
var pod *v1.Pod
switch t := obj.(type) {
case *v1.Pod:
pod = t
case clientgocache.DeletedFinalStateUnknown:
var ok bool
pod, ok = t.Obj.(*v1.Pod)
if !ok {
log.V(3).Info("warn: cannot convert to *v1.Pod: %v", t.Obj)
return
}
default:
log.V(3).Info("warn: cannot convert to *v1.Pod: %v", t)
return
}
log.V(100).Info("debug: delete pod %s in ns %s", pod.Name, pod.Namespace)
podKey, err := KeyFunc(pod)
if err != nil {
log.V(3).Info("warn: Failed to get the jobkey: %v", err)
return
}
c.podQueue.Add(podKey)
c.removePodCache[podKey] = pod
}
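
A minimal sketch of constructing and running this controller, assuming the package lives at pkg/gpushare and the process runs in-cluster; the worker count of 2 and the resync period are arbitrary choices for the sketch:

package main

import (
	"time"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/gpushare"
	kubeinformers "k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

func main() {
	cfg, err := rest.InClusterConfig()
	if err != nil {
		panic(err)
	}
	clientset := kubernetes.NewForConfigOrDie(cfg)
	factory := kubeinformers.NewSharedInformerFactory(clientset, 30*time.Second)

	stopCh := make(chan struct{})
	// NewController registers the event handlers, starts the informers,
	// and waits for their caches to sync.
	controller, err := gpushare.NewController(clientset, factory, stopCh)
	if err != nil {
		panic(err)
	}
	if err := controller.BuildCache(); err != nil {
		panic(err)
	}
	if err := controller.Run(2, stopCh); err != nil {
		panic(err)
	}
}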

View File

@@ -0,0 +1,70 @@
package log
import (
"fmt"
"os"
"sync"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
)
type levelLogger struct {
level *int32
mu sync.Mutex
log *zap.Logger
}
type verbose bool
var l *levelLogger
func NewLoggerWithLevel(level int32, option ...zap.Option) {
cfg := zap.NewProductionEncoderConfig()
cfg.EncodeTime = zapcore.ISO8601TimeEncoder
core := zapcore.NewCore(
zapcore.NewJSONEncoder(cfg),
zapcore.Lock(os.Stdout),
zap.NewAtomicLevel(),
)
if option == nil {
option = []zap.Option{}
}
option = append(option, zap.AddCaller(), zap.AddCallerSkip(1))
l = &levelLogger{
level: &level,
mu: sync.Mutex{},
log: zap.New(core, option...),
}
}
/*
V gates logging by verbosity: a message is emitted only when its level is
below the level the global logger was configured with (default level 3;
use 10 or higher for debugging). Suggested levels:
	api request    4
	api response   9
	services func  5
	db error       9
	db query       11
	db result      15
*/
func V(level int32) verbose {
return level < *l.level
}
func (v verbose) Info(format string, args ...interface{}) {
if v {
l.log.Info(fmt.Sprintf(format, args...))
}
}
func Fatal(format string, args ...interface{}) {
l.log.Fatal(fmt.Sprintf(format, args...))
}
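
A short usage sketch of the verbosity gate: V(level) emits only when the message level is below the configured threshold, so with a threshold of 10 the V(3) call logs and the V(100) call is suppressed:

package main

import "github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"

func main() {
	log.NewLoggerWithLevel(10) // threshold: messages with level < 10 are emitted

	log.V(3).Info("info: emitted, because 3 < 10")
	log.V(100).Info("debug: suppressed, because 100 >= 10")
}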

View File

@@ -0,0 +1,64 @@
package routes
import (
"net/http"
"net/http/pprof"
"github.com/julienschmidt/httprouter"
)
func AddPProf(r *httprouter.Router) {
r.GET("/debug/pprof/", index)
r.GET("/debug/pprof/cmdline/", cmdline)
r.GET("/debug/pprof/profile/", profile)
r.GET("/debug/pprof/symbol/", symbol)
r.GET("/debug/pprof/trace/", trace)
r.GET("/debug/pprof/heap/", heap)
r.GET("/debug/pprof/goroutine/", goroutine)
r.GET("/debug/pprof/block/", block)
r.GET("/debug/pprof/threadcreate/", threadcreate)
r.GET("/debug/pprof/mutex/", mutex)
}
// profiling tools handlers
func index(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
pprof.Index(w, r)
}
func cmdline(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
pprof.Cmdline(w, r)
}
func profile(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
pprof.Profile(w, r)
}
func symbol(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
pprof.Symbol(w, r)
}
func trace(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
pprof.Trace(w, r)
}
func heap(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
pprof.Handler("heap").ServeHTTP(w, r)
}
func goroutine(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
pprof.Handler("goroutine").ServeHTTP(w, r)
}
func block(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
pprof.Handler("block").ServeHTTP(w, r)
}
func threadcreate(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
pprof.Handler("threadcreate").ServeHTTP(w, r)
}
func mutex(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
pprof.Handler("mutex").ServeHTTP(w, r)
}
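
A minimal sketch of serving these handlers, assuming the package lives at pkg/routes; the listen address is illustrative:

package main

import (
	"net/http"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/routes"
	"github.com/julienschmidt/httprouter"
)

func main() {
	router := httprouter.New()
	routes.AddPProf(router)
	// Profiles are then reachable at e.g.
	// http://localhost:6060/debug/pprof/heap/ (note the trailing slash,
	// which matches the registered routes).
	if err := http.ListenAndServe(":6060", router); err != nil {
		panic(err)
	}
}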

View File

@@ -0,0 +1,181 @@
package routes
import (
"bytes"
"encoding/json"
"fmt"
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
"io"
"net/http"
"time"
"github.com/julienschmidt/httprouter"
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/scheduler"
schedulerapi "k8s.io/kube-scheduler/extender/v1"
)
const (
versionPath = "/version"
apiPrefix = "/gpushare-scheduler"
bindPrefix = apiPrefix + "/bind"
predicatesPrefix = apiPrefix + "/filter"
inspectPrefix = apiPrefix + "/inspect/:nodename"
inspectListPrefix = apiPrefix + "/inspect"
)
var (
version = "0.1.0"
// mu sync.RWMutex
)
func checkBody(w http.ResponseWriter, r *http.Request) {
if r.Body == nil {
http.Error(w, "Please send a request body", 400)
return
}
}
func InspectRoute(inspect *scheduler.Inspect) httprouter.Handle {
return func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
result := inspect.Handler(ps.ByName("nodename"))
if resultBody, err := json.Marshal(result); err != nil {
// panic(err)
log.V(3).Info("warn: Failed due to %v", err)
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusInternalServerError)
			errMsg := fmt.Sprintf("{\"error\":%q}", err.Error())
w.Write([]byte(errMsg))
} else {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
w.Write(resultBody)
}
}
}
func PredicateRoute(predicate *scheduler.Predicate) httprouter.Handle {
return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
checkBody(w, r)
// mu.RLock()
// defer mu.RUnlock()
var buf bytes.Buffer
body := io.TeeReader(r.Body, &buf)
var extenderArgs schedulerapi.ExtenderArgs
var extenderFilterResult *schedulerapi.ExtenderFilterResult
if err := json.NewDecoder(body).Decode(&extenderArgs); err != nil {
log.V(3).Info("warn: failed to parse request due to error %v", err)
extenderFilterResult = &schedulerapi.ExtenderFilterResult{
Nodes: nil,
FailedNodes: nil,
Error: err.Error(),
}
} else {
log.V(90).Info("debug: gpusharingfilter ExtenderArgs =%v", extenderArgs)
extenderFilterResult = predicate.Handler(&extenderArgs)
}
if resultBody, err := json.Marshal(extenderFilterResult); err != nil {
// panic(err)
log.V(3).Info("warn: Failed due to %v", err)
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusInternalServerError)
			errMsg := fmt.Sprintf("{\"error\":%q}", err.Error())
w.Write([]byte(errMsg))
} else {
log.V(100).Info("predicate: %s, extenderFilterResult = %s ", predicate.Name, resultBody)
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
w.Write(resultBody)
}
}
}
func BindRoute(bind *scheduler.Bind) httprouter.Handle {
return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
checkBody(w, r)
// mu.Lock()
// defer mu.Unlock()
var buf bytes.Buffer
body := io.TeeReader(r.Body, &buf)
var extenderBindingArgs schedulerapi.ExtenderBindingArgs
var extenderBindingResult *schedulerapi.ExtenderBindingResult
failed := false
if err := json.NewDecoder(body).Decode(&extenderBindingArgs); err != nil {
extenderBindingResult = &schedulerapi.ExtenderBindingResult{
Error: err.Error(),
}
failed = true
} else {
log.V(10).Info("debug: gpusharingBind ExtenderArgs =%v", extenderBindingArgs)
extenderBindingResult = bind.Handler(extenderBindingArgs)
}
if len(extenderBindingResult.Error) > 0 {
failed = true
}
if resultBody, err := json.Marshal(extenderBindingResult); err != nil {
log.V(3).Info("warn: Failed due to %v", err)
// panic(err)
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusInternalServerError)
			errMsg := fmt.Sprintf("{\"error\":%q}", err.Error())
w.Write([]byte(errMsg))
} else {
log.V(3).Info("info: extenderBindingResult = %s", resultBody)
w.Header().Set("Content-Type", "application/json")
if failed {
w.WriteHeader(http.StatusInternalServerError)
} else {
w.WriteHeader(http.StatusOK)
}
w.Write(resultBody)
}
}
}
func VersionRoute(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	fmt.Fprint(w, version)
}
func AddVersion(router *httprouter.Router) {
router.GET(versionPath, DebugLogging(VersionRoute, versionPath))
}
func DebugLogging(h httprouter.Handle, path string) httprouter.Handle {
return func(w http.ResponseWriter, r *http.Request, p httprouter.Params) {
log.V(90).Info("path: %s, request body = %s", path, r.Body)
startTime := time.Now()
h(w, r, p)
log.V(90).Info("path: %s, response: %v, cost_time: %v", path, w, time.Now().Sub(startTime))
}
}
func AddPredicate(router *httprouter.Router, predicate *scheduler.Predicate) {
// path := predicatesPrefix + "/" + predicate.Name
router.POST(predicatesPrefix, DebugLogging(PredicateRoute(predicate), predicatesPrefix))
}
func AddBind(router *httprouter.Router, bind *scheduler.Bind) {
if handle, _, _ := router.Lookup("POST", bindPrefix); handle != nil {
log.V(3).Info("warning: AddBind was called more then once!")
} else {
router.POST(bindPrefix, DebugLogging(BindRoute(bind), bindPrefix))
}
}
func AddInspect(router *httprouter.Router, inspect *scheduler.Inspect) {
router.GET(inspectPrefix, DebugLogging(InspectRoute(inspect), inspectPrefix))
router.GET(inspectListPrefix, DebugLogging(InspectRoute(inspect), inspectListPrefix))
}
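
Putting the endpoints together, a hedged sketch of wiring the predicate, bind, and inspect handlers onto one router; the clientset and scheduler cache are assumed to be built as in the earlier controller sketch, and the port is illustrative:

package server

import (
	"context"
	"net/http"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/routes"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/scheduler"
	"github.com/julienschmidt/httprouter"
	"k8s.io/client-go/kubernetes"
)

// Serve wires the version, filter, bind, and inspect endpoints onto one
// router and blocks serving them.
func Serve(clientset *kubernetes.Clientset, c *cache.SchedulerCache) error {
	router := httprouter.New()
	routes.AddVersion(router)
	routes.AddPredicate(router, scheduler.NewGPUsharePredicate(clientset, c))
	routes.AddBind(router, scheduler.NewGPUShareBind(context.Background(), clientset, c))
	routes.AddInspect(router, scheduler.NewGPUShareInspect(c))
	return http.ListenAndServe(":39999", router)
}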

View File

@@ -0,0 +1,26 @@
package scheduler
import (
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
"k8s.io/apimachinery/pkg/types"
schedulerapi "k8s.io/kube-scheduler/extender/v1"
)
// Bind is responsible for binding node and pod
type Bind struct {
Name string
Func func(podName string, podNamespace string, podUID types.UID, node string, cache *cache.SchedulerCache) error
cache *cache.SchedulerCache
}
// Handler handles the Bind request
func (b Bind) Handler(args schedulerapi.ExtenderBindingArgs) *schedulerapi.ExtenderBindingResult {
err := b.Func(args.PodName, args.PodNamespace, args.PodUID, args.Node, b.cache)
errMsg := ""
if err != nil {
errMsg = err.Error()
}
return &schedulerapi.ExtenderBindingResult{
Error: errMsg,
}
}

View File

@@ -0,0 +1,71 @@
package scheduler
import (
"context"
"fmt"
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
)
const (
OptimisticLockErrorMsg = "the object has been modified; please apply your changes to the latest version and try again"
)
func NewGPUShareBind(ctx context.Context, clientset *kubernetes.Clientset, c *cache.SchedulerCache) *Bind {
return &Bind{
Name: "gpusharingbinding",
Func: func(name string, namespace string, podUID types.UID, node string, c *cache.SchedulerCache) error {
pod, err := getPod(ctx, name, namespace, podUID, clientset, c)
if err != nil {
log.V(9).Info("warn: Failed to handle pod %s in ns %s due to error %v", name, namespace, err)
return err
}
nodeInfo, err := c.GetNodeInfo(node)
if err != nil {
log.V(9).Info("warn: Failed to handle pod %s in ns %s due to error %v", name, namespace, err)
return err
}
err = nodeInfo.Allocate(clientset, pod)
if err != nil {
log.V(9).Info("warn: Failed to handle pod %s in ns %s due to error %v", name, namespace, err)
return err
}
return nil
},
cache: c,
}
}
func getPod(ctx context.Context, name string, namespace string, podUID types.UID, clientset *kubernetes.Clientset, c *cache.SchedulerCache) (pod *v1.Pod, err error) {
pod, err = c.GetPod(name, namespace)
if errors.IsNotFound(err) {
pod, err = clientset.CoreV1().Pods(namespace).Get(ctx, name, metav1.GetOptions{})
if err != nil {
return nil, err
}
} else if err != nil {
return nil, err
}
if pod.UID != podUID {
pod, err = clientset.CoreV1().Pods(namespace).Get(ctx, name, metav1.GetOptions{})
if err != nil {
return nil, err
}
if pod.UID != podUID {
return nil, fmt.Errorf("The pod %s in ns %s's uid is %v, and it's not equal with expected %v",
name,
namespace,
pod.UID,
podUID)
}
}
return pod, nil
}

View File

@@ -0,0 +1,42 @@
package scheduler
import (
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
)
func NewGPUShareInspect(c *cache.SchedulerCache) *Inspect {
return &Inspect{
Name: "gpushareinspect",
cache: c,
}
}
type Result struct {
Nodes []*Node `json:"nodes"`
Error string `json:"error,omitempty"`
}
type Node struct {
Name string `json:"name"`
TotalGPU uint `json:"totalGPU"`
UsedGPU uint `json:"usedGPU"`
Devices []*Device `json:"devs"`
}
type Device struct {
ID int `json:"id"`
TotalGPU uint `json:"totalGPU"`
UsedGPU uint `json:"usedGPU"`
Pods []*Pod `json:"pods"`
}
type Pod struct {
Name string `json:"name"`
Namespace string `json:"namespace"`
UsedGPU int `json:"usedGPU"`
}
type Inspect struct {
Name string
cache *cache.SchedulerCache
}
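
To make the wire format of the inspect endpoint concrete, a sketch that marshals a hand-built Result; the node, pod, and memory figures are placeholders:

package main

import (
	"encoding/json"
	"fmt"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/scheduler"
)

func main() {
	r := scheduler.Result{
		Nodes: []*scheduler.Node{{
			Name:     "node-a",
			TotalGPU: 16,
			UsedGPU:  4,
			Devices: []*scheduler.Device{{
				ID:       0,
				TotalGPU: 8,
				UsedGPU:  4,
				Pods:     []*scheduler.Pod{{Name: "gpu-demo", Namespace: "default", UsedGPU: 4}},
			}},
		}},
	}
	out, _ := json.MarshalIndent(r, "", "  ")
	fmt.Println(string(out))
	// {"nodes":[{"name":"node-a","totalGPU":16,"usedGPU":4,"devs":[...]}]}
}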

View File

@@ -0,0 +1,10 @@
package scheduler
import (
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
"k8s.io/client-go/kubernetes"
)
func NewGPUsharePredicate(clientset *kubernetes.Clientset, c *cache.SchedulerCache) *Predicate {
return &Predicate{Name: "gpusharingfilter", cache: c}
}

View File

@@ -0,0 +1,69 @@
package scheduler
import (
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
)
func (in Inspect) Handler(name string) *Result {
nodes := []*Node{}
errMsg := ""
if len(name) == 0 {
nodeInfos := in.cache.GetNodeinfos()
for _, info := range nodeInfos {
nodes = append(nodes, buildNode(info))
}
	} else {
		node, err := in.cache.GetNodeInfo(name)
		if err != nil {
			errMsg = err.Error()
		} else {
			nodes = append(nodes, buildNode(node))
		}
	}
return &Result{
Nodes: nodes,
Error: errMsg,
}
}
func buildNode(info *cache.NodeInfo) *Node {
devInfos := info.GetDevs()
devs := []*Device{}
var usedGPU uint
for i, devInfo := range devInfos {
dev := &Device{
ID: i,
TotalGPU: devInfo.GetTotalGPUMemory(),
UsedGPU: devInfo.GetUsedGPUMemory(),
}
podInfos := devInfo.GetPods()
pods := []*Pod{}
for _, podInfo := range podInfos {
if utils.AssignedNonTerminatedPod(podInfo) {
pod := &Pod{
Namespace: podInfo.Namespace,
Name: podInfo.Name,
UsedGPU: utils.GetGPUMemoryFromPodResource(podInfo),
}
pods = append(pods, pod)
}
}
dev.Pods = pods
devs = append(devs, dev)
usedGPU += devInfo.GetUsedGPUMemory()
}
return &Node{
Name: info.GetName(),
TotalGPU: uint(info.GetTotalGPUMemory()),
UsedGPU: usedGPU,
Devices: devs,
}
}

View File

@@ -0,0 +1,87 @@
package scheduler
import (
"fmt"
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
"k8s.io/api/core/v1"
schedulerapi "k8s.io/kube-scheduler/extender/v1"
)
type Predicate struct {
Name string
cache *cache.SchedulerCache
}
func (p Predicate) checkNode(pod *v1.Pod, nodeName string, c *cache.SchedulerCache) (*v1.Node, error) {
log.V(10).Info("info: check if the pod name %s can be scheduled on node %s", pod.Name, nodeName)
nodeInfo, err := c.GetNodeInfo(nodeName)
if err != nil {
return nil, err
}
node := nodeInfo.GetNode()
if node == nil {
return nil, fmt.Errorf("failed get node with name %s", nodeName)
}
if !utils.IsGPUSharingNode(node) {
return nil, fmt.Errorf("The node %s is not for GPU share, need skip", nodeName)
}
allocatable := nodeInfo.Assume(pod)
if !allocatable {
return nil, fmt.Errorf("Insufficient GPU Memory in one device")
} else {
log.V(10).Info("info: The pod %s in the namespace %s can be scheduled on %s",
pod.Name,
pod.Namespace,
nodeName)
}
return node, nil
}
func (p Predicate) Handler(args *schedulerapi.ExtenderArgs) *schedulerapi.ExtenderFilterResult {
if args == nil || args.Pod == nil {
return &schedulerapi.ExtenderFilterResult{Error: fmt.Sprintf("arg or pod is nil")}
}
pod := args.Pod
var nodeNames []string
if args.NodeNames != nil {
nodeNames = *args.NodeNames
log.V(3).Info("extender args NodeNames is not nil, result %+v", nodeNames)
} else if args.Nodes != nil {
for _, n := range args.Nodes.Items {
nodeNames = append(nodeNames, n.Name)
}
log.V(3).Info("extender args Nodes is not nil, names is %+v", nodeNames)
} else {
return &schedulerapi.ExtenderFilterResult{Error: fmt.Sprintf("cannot get node names")}
}
canSchedule := make([]string, 0, len(nodeNames))
canNotSchedule := make(map[string]string)
canScheduleNodes := &v1.NodeList{}
for _, nodeName := range nodeNames {
node, err := p.checkNode(pod, nodeName, p.cache)
if err != nil {
canNotSchedule[nodeName] = err.Error()
} else {
if node != nil {
canSchedule = append(canSchedule, nodeName)
canScheduleNodes.Items = append(canScheduleNodes.Items, *node)
}
}
}
result := schedulerapi.ExtenderFilterResult{
NodeNames: &canSchedule,
Nodes: canScheduleNodes,
FailedNodes: canNotSchedule,
Error: "",
}
log.V(100).Info("predicate result for %s, is %+v", pod.Name, result)
return &result
}

View File

@@ -0,0 +1,13 @@
package utils
const (
ResourceName = "rainbond.com/gpu-mem"
CountName = "rainbond.com/gpu-count"
EnvNVGPU = "NVIDIA_VISIBLE_DEVICES"
EnvResourceIndex = "ALIYUN_COM_GPU_MEM_IDX"
EnvResourceByPod = "ALIYUN_COM_GPU_MEM_POD"
EnvResourceByDev = "ALIYUN_COM_GPU_MEM_DEV"
EnvAssignedFlag = "ALIYUN_COM_GPU_MEM_ASSIGNED"
EnvResourceAssumeTime = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
)
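
These constants define the resource names and annotation/env keys the rest of the package keys off. A sketch of the pod shape they imply: a container limit on rainbond.com/gpu-mem is what marks a pod as GPU-sharing. The names, image, and the figure 4 (whose unit follows the device-plugin convention) are illustrative only:

package main

import (
	"fmt"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func examplePod() *v1.Pod {
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "gpu-demo", Namespace: "default"},
		Spec: v1.PodSpec{
			Containers: []v1.Container{{
				Name:  "cuda",
				Image: "nvidia/cuda:11.8.0-base-ubuntu22.04",
				Resources: v1.ResourceRequirements{
					// rainbond.com/gpu-mem limit; the unit follows the
					// device-plugin convention.
					Limits: v1.ResourceList{utils.ResourceName: resource.MustParse("4")},
				},
			}},
		},
	}
}

func main() {
	fmt.Println(utils.IsGPUsharingPod(examplePod())) // true
}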

View File

@@ -0,0 +1,30 @@
package utils
import "k8s.io/api/core/v1"
// IsGPUSharingNode reports whether the node is enabled for GPU sharing
func IsGPUSharingNode(node *v1.Node) bool {
return GetTotalGPUMemory(node) > 0
}
// Get the total GPU memory of the Node
func GetTotalGPUMemory(node *v1.Node) int {
val, ok := node.Status.Capacity[ResourceName]
if !ok {
return 0
}
return int(val.Value())
}
// Get the GPU count of the node
func GetGPUCountInNode(node *v1.Node) int {
val, ok := node.Status.Capacity[CountName]
if !ok {
		return 0
}
return int(val.Value())
}
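
A sketch with a fabricated node status showing what these helpers read; on real nodes the capacities are advertised by the GPU device plugin, which is outside this commit:

package main

import (
	"fmt"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	node := &v1.Node{
		Status: v1.NodeStatus{
			Capacity: v1.ResourceList{
				utils.ResourceName: resource.MustParse("16"), // total gpu-mem
				utils.CountName:    resource.MustParse("2"),  // physical GPUs
			},
		},
	}
	fmt.Println(utils.IsGPUSharingNode(node))  // true
	fmt.Println(utils.GetTotalGPUMemory(node)) // 16
	fmt.Println(utils.GetGPUCountInNode(node)) // 2
}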

View File

@@ -0,0 +1,219 @@
package utils
import (
"encoding/json"
"fmt"
"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
v1 "k8s.io/api/core/v1"
"strconv"
"time"
)
// AssignedNonTerminatedPod selects pods that are assigned and non-terminal (scheduled and running).
func AssignedNonTerminatedPod(pod *v1.Pod) bool {
if pod.DeletionTimestamp != nil {
return false
}
if len(pod.Spec.NodeName) == 0 {
return false
}
if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
return false
}
return true
}
// IsCompletePod determines if the pod is complete
func IsCompletePod(pod *v1.Pod) bool {
if pod.DeletionTimestamp != nil {
return true
}
if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
return true
}
return false
}
// IsGPUsharingPod determines if it's the pod for GPU sharing
func IsGPUsharingPod(pod *v1.Pod) bool {
return GetGPUMemoryFromPodResource(pod) > 0
}
// GetGPUIDFromAnnotation gets GPU ID from Annotation
func GetGPUIDFromAnnotation(pod *v1.Pod) int {
id := -1
if len(pod.ObjectMeta.Annotations) > 0 {
value, found := pod.ObjectMeta.Annotations[EnvResourceIndex]
if found {
var err error
id, err = strconv.Atoi(value)
if err != nil {
log.V(9).Info("warn: Failed due to %v for pod %s in ns %s", err, pod.Name, pod.Namespace)
id = -1
}
}
}
return id
}
// GetGPUIDFromEnv gets GPU ID from Env
func GetGPUIDFromEnv(pod *v1.Pod) int {
id := -1
for _, container := range pod.Spec.Containers {
id = getGPUIDFromContainer(container)
if id >= 0 {
return id
}
}
return id
}
func getGPUIDFromContainer(container v1.Container) (devIdx int) {
devIdx = -1
var err error
loop:
for _, env := range container.Env {
if env.Name == EnvResourceIndex {
devIdx, err = strconv.Atoi(env.Value)
if err != nil {
log.V(9).Info("warn: Failed due to %v for %s", err, container.Name)
devIdx = -1
}
break loop
}
}
return devIdx
}
// GetGPUMemoryFromPodAnnotation gets the GPU memory of the pod from the ALIYUN_COM_GPU_MEM_POD annotation
func GetGPUMemoryFromPodAnnotation(pod *v1.Pod) (gpuMemory uint) {
if len(pod.ObjectMeta.Annotations) > 0 {
value, found := pod.ObjectMeta.Annotations[EnvResourceByPod]
if found {
s, _ := strconv.Atoi(value)
if s < 0 {
s = 0
}
gpuMemory += uint(s)
}
}
log.V(100).Info("debug: pod %s in ns %s with status %v has GPU Mem %d",
pod.Name,
pod.Namespace,
pod.Status.Phase,
gpuMemory)
return gpuMemory
}
// GetGPUMemoryFromPodEnv sums the GPU memory requested through the containers' env vars
func GetGPUMemoryFromPodEnv(pod *v1.Pod) (gpuMemory uint) {
for _, container := range pod.Spec.Containers {
gpuMemory += getGPUMemoryFromContainerEnv(container)
}
log.V(100).Info("debug: pod %s in ns %s with status %v has GPU Mem %d",
pod.Name,
pod.Namespace,
pod.Status.Phase,
gpuMemory)
return gpuMemory
}
func getGPUMemoryFromContainerEnv(container v1.Container) (gpuMemory uint) {
gpuMemory = 0
loop:
for _, env := range container.Env {
if env.Name == EnvResourceByPod {
s, _ := strconv.Atoi(env.Value)
if s < 0 {
s = 0
}
gpuMemory = uint(s)
break loop
}
}
return gpuMemory
}
// GetGPUMemoryFromPodResource gets GPU Memory of the Pod
func GetGPUMemoryFromPodResource(pod *v1.Pod) int {
var total int
containers := pod.Spec.Containers
for _, container := range containers {
if val, ok := container.Resources.Limits[ResourceName]; ok {
total += int(val.Value())
}
}
return total
}
// GetGPUMemoryFromContainerResource gets GPU Memory of the Container
func GetGPUMemoryFromContainerResource(container v1.Container) int {
var total int
if val, ok := container.Resources.Limits[ResourceName]; ok {
total += int(val.Value())
}
return total
}
// GetUpdatedPodEnvSpec updates pod env with devId
func GetUpdatedPodEnvSpec(oldPod *v1.Pod, devId int, totalGPUMemByDev int) (newPod *v1.Pod) {
newPod = oldPod.DeepCopy()
for i, c := range newPod.Spec.Containers {
gpuMem := GetGPUMemoryFromContainerResource(c)
if gpuMem > 0 {
			envs := []v1.EnvVar{
				// {Name: EnvNVGPU, Value: fmt.Sprintf("%d", devId)},
				{Name: EnvResourceIndex, Value: fmt.Sprintf("%d", devId)},
				{Name: EnvResourceByPod, Value: fmt.Sprintf("%d", gpuMem)},
				{Name: EnvResourceByDev, Value: fmt.Sprintf("%d", totalGPUMemByDev)},
				{Name: EnvAssignedFlag, Value: "false"},
			}
			newPod.Spec.Containers[i].Env = append(newPod.Spec.Containers[i].Env, envs...)
}
}
return newPod
}
// GetUpdatedPodAnnotationSpec updates pod annotations with devId
func GetUpdatedPodAnnotationSpec(oldPod *v1.Pod, devId int, totalGPUMemByDev int) (newPod *v1.Pod) {
newPod = oldPod.DeepCopy()
if len(newPod.ObjectMeta.Annotations) == 0 {
newPod.ObjectMeta.Annotations = map[string]string{}
}
now := time.Now()
newPod.ObjectMeta.Annotations[EnvResourceIndex] = fmt.Sprintf("%d", devId)
newPod.ObjectMeta.Annotations[EnvResourceByDev] = fmt.Sprintf("%d", totalGPUMemByDev)
newPod.ObjectMeta.Annotations[EnvResourceByPod] = fmt.Sprintf("%d", GetGPUMemoryFromPodResource(newPod))
newPod.ObjectMeta.Annotations[EnvAssignedFlag] = "false"
newPod.ObjectMeta.Annotations[EnvResourceAssumeTime] = fmt.Sprintf("%d", now.UnixNano())
return newPod
}
func PatchPodAnnotationSpec(oldPod *v1.Pod, devId int, totalGPUMemByDev int) ([]byte, error) {
now := time.Now()
patchAnnotations := map[string]interface{}{
"metadata": map[string]map[string]string{"annotations": {
EnvResourceIndex: fmt.Sprintf("%d", devId),
EnvResourceByDev: fmt.Sprintf("%d", totalGPUMemByDev),
EnvResourceByPod: fmt.Sprintf("%d", GetGPUMemoryFromPodResource(oldPod)),
EnvAssignedFlag: "false",
EnvResourceAssumeTime: fmt.Sprintf("%d", now.UnixNano()),
}}}
return json.Marshal(patchAnnotations)
}
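
A sketch of calling PatchPodAnnotationSpec and the rough shape of the strategic-merge patch it yields; the empty pod is only for brevity:

package main

import (
	"fmt"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
	v1 "k8s.io/api/core/v1"
)

func main() {
	pod := &v1.Pod{} // no gpu-mem limits, so ALIYUN_COM_GPU_MEM_POD ends up "0"
	patch, err := utils.PatchPodAnnotationSpec(pod, 1, 16)
	if err != nil {
		panic(err)
	}
	// Prints a strategic-merge patch roughly like:
	// {"metadata":{"annotations":{"ALIYUN_COM_GPU_MEM_ASSIGNED":"false",
	//  "ALIYUN_COM_GPU_MEM_ASSUME_TIME":"...","ALIYUN_COM_GPU_MEM_DEV":"16",
	//  "ALIYUN_COM_GPU_MEM_IDX":"1","ALIYUN_COM_GPU_MEM_POD":"0"}}}
	fmt.Println(string(patch))
}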

View File

@@ -0,0 +1,30 @@
package signals
import (
"os"
"os/signal"
"syscall"
)
var onlyOneSignalHandler = make(chan struct{})
var shutdownSignals = []os.Signal{os.Interrupt, syscall.SIGTERM}
// SetupSignalHandler registers for SIGTERM and SIGINT. A stop channel is returned
// which is closed on one of these signals. If a second signal is caught, the program
// is terminated with exit code 1.
func SetupSignalHandler() (stopCh <-chan struct{}) {
close(onlyOneSignalHandler) // panics when called twice
stop := make(chan struct{})
c := make(chan os.Signal, 2)
signal.Notify(c, shutdownSignals...)
go func() {
<-c
close(stop)
<-c
os.Exit(1) // second signal. Exit directly.
}()
return stop
}
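
A minimal usage sketch, assuming the package lives at pkg/signals: the returned channel is the stopCh fed to the informer factory and controller above, so one SIGINT/SIGTERM triggers a clean shutdown and a second exits immediately:

package main

import "github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/signals"

func main() {
	stopCh := signals.SetupSignalHandler()
	// Hand stopCh to the informer factory and controller, then block:
	// the first SIGINT/SIGTERM closes it, a second exits immediately.
	<-stopCh
}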