Rainbond/gpushare-scheduler-extender/pkg/gpushare/controller.go

package gpushare

import (
    "fmt"
    "time"

    "github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
    "github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
    "github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
    "golang.org/x/time/rate"
    "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/errors"
    "k8s.io/apimachinery/pkg/util/runtime"
    "k8s.io/apimachinery/pkg/util/wait"
    kubeinformers "k8s.io/client-go/informers"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/kubernetes/scheme"
    typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
    corelisters "k8s.io/client-go/listers/core/v1"
    clientgocache "k8s.io/client-go/tools/cache"
    "k8s.io/client-go/tools/record"
    "k8s.io/client-go/util/workqueue"
)
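
// KeyFunc maps an object to its namespace/name work-queue key.
// DeletionHandlingMetaNamespaceKeyFunc also accepts the DeletedFinalStateUnknown
// tombstones handed back by the informer.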
var (
    KeyFunc = clientgocache.DeletionHandlingMetaNamespaceKeyFunc
)
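
// Controller watches GPU-sharing pods through the shared informers and keeps
// the scheduler cache in sync with their lifecycle.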
type Controller struct {
    clientset *kubernetes.Clientset

    // podLister can list/get pods from the shared informer's store.
    podLister corelisters.PodLister

    // nodeLister can list/get nodes from the shared informer's store.
    nodeLister corelisters.NodeLister

    // podQueue is a rate-limited work queue. This is used to queue work to be
    // processed instead of performing it as soon as a change happens. This
    // means we can ensure we only process a fixed amount of resources at a
    // time, and makes it easy to ensure we are never processing the same item
    // simultaneously in two different workers.
    podQueue workqueue.RateLimitingInterface

    // recorder is an event recorder for recording Event resources to the
    // Kubernetes API.
    recorder record.EventRecorder

    // podInformerSynced returns true if the pod store has been synced at least once.
    podInformerSynced clientgocache.InformerSynced

    // nodeInformerSynced returns true if the node store has been synced at least once.
    nodeInformerSynced clientgocache.InformerSynced

    schedulerCache *cache.SchedulerCache

    // removePodCache keeps the last observed copy of each deleted pod, keyed by
    // namespace/name, until syncPod removes it from the scheduler cache.
    removePodCache map[string]*v1.Pod
}
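
// NewController wires a Controller to the shared informer factory: it registers
// a filtered pod informer for GPU-sharing pods, sets up node and ConfigMap
// informers, starts the factory, and waits for all caches to sync.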
func NewController(clientset *kubernetes.Clientset, kubeInformerFactory kubeinformers.SharedInformerFactory, stopCh <-chan struct{}) (*Controller, error) {
    log.V(100).Info("info: Creating event broadcaster")
    eventBroadcaster := record.NewBroadcaster()
    // eventBroadcaster.StartLogging(log.Infof)
    eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: clientset.CoreV1().Events("")})
    recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "gpushare-schd-extender"})

    rateLimiter := workqueue.NewMaxOfRateLimiter(
        workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 1000*time.Second),
        &workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(100), 500)},
    )

    c := &Controller{
        clientset:      clientset,
        podQueue:       workqueue.NewNamedRateLimitingQueue(rateLimiter, "podQueue"),
        recorder:       recorder,
        removePodCache: map[string]*v1.Pod{},
    }

    // Create pod informer.
    podInformer := kubeInformerFactory.Core().V1().Pods()
    podInformer.Informer().AddEventHandler(clientgocache.FilteringResourceEventHandler{
        FilterFunc: func(obj interface{}) bool {
            switch t := obj.(type) {
            case *v1.Pod:
                // log.V(100).Info("debug: added pod %s in ns %s", t.Name, t.Namespace)
                return utils.IsGPUsharingPod(t)
            case clientgocache.DeletedFinalStateUnknown:
                if pod, ok := t.Obj.(*v1.Pod); ok {
                    log.V(100).Info("debug: delete pod %s in ns %s", pod.Name, pod.Namespace)
                    return utils.IsGPUsharingPod(pod)
                }
                runtime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod in %T", obj, c))
                return false
            default:
                runtime.HandleError(fmt.Errorf("unable to handle object in %T: %T", c, obj))
                return false
            }
        },
        Handler: clientgocache.ResourceEventHandlerFuncs{
            AddFunc:    c.addPodToCache,
            UpdateFunc: c.updatePodInCache,
            DeleteFunc: c.deletePodFromCache,
        },
    })
    c.podLister = podInformer.Lister()
    c.podInformerSynced = podInformer.Informer().HasSynced

    // Create node informer.
    nodeInformer := kubeInformerFactory.Core().V1().Nodes()
    c.nodeLister = nodeInformer.Lister()
    c.nodeInformerSynced = nodeInformer.Informer().HasSynced

    // Create configMap informer.
    cmInformer := kubeInformerFactory.Core().V1().ConfigMaps()
    cache.ConfigMapLister = cmInformer.Lister()
    cache.ConfigMapInformerSynced = cmInformer.Informer().HasSynced

    // Start informer goroutines.
    go kubeInformerFactory.Start(stopCh)

    // Create scheduler cache.
    c.schedulerCache = cache.NewSchedulerCache(c.nodeLister, c.podLister)

    log.V(100).Info("info: begin to wait for cache")

    if ok := clientgocache.WaitForCacheSync(stopCh, c.nodeInformerSynced); !ok {
        return nil, fmt.Errorf("failed to wait for node caches to sync")
    } else {
        log.V(100).Info("info: init the node cache successfully")
    }

    if ok := clientgocache.WaitForCacheSync(stopCh, c.podInformerSynced); !ok {
        return nil, fmt.Errorf("failed to wait for pod caches to sync")
    } else {
        log.V(100).Info("info: init the pod cache successfully")
    }

    if ok := clientgocache.WaitForCacheSync(stopCh, cache.ConfigMapInformerSynced); !ok {
        return nil, fmt.Errorf("failed to wait for configmap caches to sync")
    } else {
        log.V(100).Info("info: init the configmap cache successfully")
    }

    log.V(100).Info("info: end to wait for cache")

    return c, nil
}
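
// BuildCache populates the scheduler cache from the cluster's current state.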
func (c *Controller) BuildCache() error {
    return c.schedulerCache.BuildCache()
}
func (c *Controller) GetSchedulerCache() *cache.SchedulerCache {
    return c.schedulerCache
}

// Run starts threadiness worker goroutines to process the pod queue and blocks
// until stopCh is closed.
func (c *Controller) Run(threadiness int, stopCh <-chan struct{}) error {
    defer runtime.HandleCrash()
    defer c.podQueue.ShutDown()

    log.V(9).Info("info: Starting GPU Sharing Controller.")
    log.V(9).Info("info: Waiting for informer caches to sync")
    log.V(9).Info("info: Starting %v workers.", threadiness)
    for i := 0; i < threadiness; i++ {
        go wait.Until(c.runWorker, time.Second, stopCh)
    }

    log.V(3).Info("info: Started workers")
    <-stopCh
    log.V(3).Info("info: Shutting down workers")

    return nil
}

// runWorker is a long-running function that will continually call the
// processNextWorkItem function in order to read and process a message on the
// workqueue.
func (c *Controller) runWorker() {
    for c.processNextWorkItem() {
    }
}

// syncPod syncs the scheduler cache with the pod identified by key. When the
// pod no longer exists or has completed, it is removed from the cache;
// otherwise it is added or updated. This function is not meant to be invoked
// concurrently with the same key.
func (c *Controller) syncPod(key string) (forget bool, err error) {
    ns, name, err := clientgocache.SplitMetaNamespaceKey(key)
    log.V(9).Info("debug: begin to sync gpushare pod %s in ns %s", name, ns)
    if err != nil {
        return false, err
    }

    pod, err := c.podLister.Pods(ns).Get(name)
    switch {
    case errors.IsNotFound(err):
        log.V(10).Info("debug: pod %s in ns %s has been deleted.", name, ns)
        pod, found := c.removePodCache[key]
        if found {
            c.schedulerCache.RemovePod(pod)
            delete(c.removePodCache, key)
        }
    case err != nil:
        log.V(10).Info("warn: unable to retrieve pod %v from the store: %v", key, err)
    default:
        if utils.IsCompletePod(pod) {
            log.V(10).Info("debug: pod %s in ns %s has completed.", name, ns)
            c.schedulerCache.RemovePod(pod)
        } else {
            err := c.schedulerCache.AddOrUpdatePod(pod)
            if err != nil {
                return false, err
            }
        }
    }

    return true, nil
}

// processNextWorkItem will read a single work item off the podQueue and
// attempt to process it.
func (c *Controller) processNextWorkItem() bool {
    log.V(100).Info("debug: begin processNextWorkItem()")
    key, quit := c.podQueue.Get()
    if quit {
        return false
    }
    defer c.podQueue.Done(key)
    defer log.V(100).Info("debug: end processNextWorkItem()")

    forget, err := c.syncPod(key.(string))
    if err == nil {
        if forget {
            c.podQueue.Forget(key)
        }
        return true
    }

    log.V(3).Info("Error syncing pods: %v", err)
    runtime.HandleError(fmt.Errorf("Error syncing pod: %v", err))
    c.podQueue.AddRateLimited(key)

    return true
}
func (c *Controller) addPodToCache(obj interface{}) {
    pod, ok := obj.(*v1.Pod)
    if !ok {
        log.V(3).Info("warn: cannot convert to *v1.Pod: %v", obj)
        return
    }

    // if !assignedNonTerminatedPod(t) {
    //     log.V(100).Info("debug: skip pod %s due to it's terminated.", pod.Name)
    //     return
    // }

    podKey, err := KeyFunc(pod)
    if err != nil {
        log.V(3).Info("warn: Failed to get the jobkey: %v", err)
        return
    }

    c.podQueue.Add(podKey)

    // NOTE: Updating equivalence cache of addPodToCache has been
    // handled optimistically in: pkg/scheduler/scheduler.go#assume()
}
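
// updatePodInCache enqueues an updated pod only when the change is relevant to
// the scheduler cache: a known pod has completed or failed, or a previously
// unknown pod now carries a GPU ID annotation.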
func (c *Controller) updatePodInCache(oldObj, newObj interface{}) {
    oldPod, ok := oldObj.(*v1.Pod)
    if !ok {
        log.V(3).Info("warn: cannot convert oldObj to *v1.Pod: %v", oldObj)
        return
    }
    newPod, ok := newObj.(*v1.Pod)
    if !ok {
        log.V(3).Info("warn: cannot convert newObj to *v1.Pod: %v", newObj)
        return
    }

    needUpdate := false
    podUID := oldPod.UID

    // 1. Need to update when a pod known to the cache has completed or failed.
    if c.schedulerCache.KnownPod(podUID) && utils.IsCompletePod(newPod) {
        needUpdate = true
    }

    // 2. Need to update when the pod is unknown to the cache and its GPU ID
    // annotation has been set.
    if !c.schedulerCache.KnownPod(podUID) && utils.GetGPUIDFromAnnotation(newPod) >= 0 {
        needUpdate = true
    }

    if needUpdate {
        podKey, err := KeyFunc(newPod)
        if err != nil {
            log.V(3).Info("warn: Failed to get the jobkey: %v", err)
            return
        }
        log.V(3).Info("info: Need to update pod name %s in ns %s and old status is %v, new status is %v; its old annotation %v and new annotation %v",
            newPod.Name,
            newPod.Namespace,
            oldPod.Status.Phase,
            newPod.Status.Phase,
            oldPod.Annotations,
            newPod.Annotations)
        c.podQueue.Add(podKey)
    } else {
        log.V(100).Info("debug: No need to update pod name %s in ns %s and old status is %v, new status is %v; its old annotation %v and new annotation %v",
            newPod.Name,
            newPod.Namespace,
            oldPod.Status.Phase,
            newPod.Status.Phase,
            oldPod.Annotations,
            newPod.Annotations)
    }
}
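
// deletePodFromCache enqueues a deleted pod and stashes the last observed
// object in removePodCache so syncPod can still remove it from the scheduler
// cache after the lister stops returning it.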
func (c *Controller) deletePodFromCache(obj interface{}) {
    var pod *v1.Pod
    switch t := obj.(type) {
    case *v1.Pod:
        pod = t
    case clientgocache.DeletedFinalStateUnknown:
        var ok bool
        pod, ok = t.Obj.(*v1.Pod)
        if !ok {
            log.V(3).Info("warn: cannot convert to *v1.Pod: %v", t.Obj)
            return
        }
    default:
        log.V(3).Info("warn: cannot convert to *v1.Pod: %v", t)
        return
    }

    log.V(100).Info("debug: delete pod %s in ns %s", pod.Name, pod.Namespace)

    podKey, err := KeyFunc(pod)
    if err != nil {
        log.V(3).Info("warn: Failed to get the jobkey: %v", err)
        return
    }
    c.podQueue.Add(podKey)
    c.removePodCache[podKey] = pod
}