synchronization
gpushare-scheduler-extender/pkg/cache/cache.go (vendored, new file, 177 lines)
@@ -0,0 +1,177 @@
package cache

import (
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
	"sync"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/types"
	corelisters "k8s.io/client-go/listers/core/v1"
)

type SchedulerCache struct {

	// nodes is a map from node name to NodeInfo.
	nodes map[string]*NodeInfo

	// nodeLister can list/get nodes from the shared informer's store.
	nodeLister corelisters.NodeLister

	// podLister can list/get pods from the shared informer's store.
	podLister corelisters.PodLister

	// knownPods records known pods; a pod is added when the ALIYUN_GPU_ID annotation is set
	// and removed when the pod completes or is deleted.
	knownPods map[types.UID]*v1.Pod
	nLock     *sync.RWMutex
}

func NewSchedulerCache(nLister corelisters.NodeLister, pLister corelisters.PodLister) *SchedulerCache {
	return &SchedulerCache{
		nodes:      make(map[string]*NodeInfo),
		nodeLister: nLister,
		podLister:  pLister,
		knownPods:  make(map[types.UID]*v1.Pod),
		nLock:      new(sync.RWMutex),
	}
}

func (cache *SchedulerCache) GetNodeinfos() []*NodeInfo {
	nodes := []*NodeInfo{}
	for _, n := range cache.nodes {
		nodes = append(nodes, n)
	}
	return nodes
}

// BuildCache builds the cache during initialization.
func (cache *SchedulerCache) BuildCache() error {
	log.V(5).Info("debug: begin to build scheduler cache")
	pods, err := cache.podLister.List(labels.Everything())

	if err != nil {
		return err
	} else {
		for _, pod := range pods {
			if utils.GetGPUMemoryFromPodAnnotation(pod) <= uint(0) {
				continue
			}

			if len(pod.Spec.NodeName) == 0 {
				continue
			}

			err = cache.AddOrUpdatePod(pod)
			if err != nil {
				return err
			}
		}
	}

	return nil
}

func (cache *SchedulerCache) GetPod(name, namespace string) (*v1.Pod, error) {
	return cache.podLister.Pods(namespace).Get(name)
}

// KnownPod reports whether the pod UID is already known to the cache.
func (cache *SchedulerCache) KnownPod(podUID types.UID) bool {
	cache.nLock.RLock()
	defer cache.nLock.RUnlock()

	_, found := cache.knownPods[podUID]
	return found
}

func (cache *SchedulerCache) AddOrUpdatePod(pod *v1.Pod) error {
	log.V(100).Info("debug: Add or update pod info: %v", pod)
	log.V(100).Info("debug: Node %v", cache.nodes)
	if len(pod.Spec.NodeName) == 0 {
		log.V(100).Info("debug: pod %s in ns %s is not assigned to any node, skip", pod.Name, pod.Namespace)
		return nil
	}

	n, err := cache.GetNodeInfo(pod.Spec.NodeName)
	if err != nil {
		return err
	}
	podCopy := pod.DeepCopy()
	if n.addOrUpdatePod(podCopy) {
		// put it into the known pods
		cache.rememberPod(pod.UID, podCopy)
	} else {
		log.V(100).Info("debug: pod %s in ns %s has GPU ID %d, which is illegal, skip",
			pod.Name,
			pod.Namespace,
			utils.GetGPUIDFromAnnotation(pod))
	}

	return nil
}

// RemovePod removes the pod; the lock is taken inside the cached NodeInfo.
func (cache *SchedulerCache) RemovePod(pod *v1.Pod) {
	log.V(100).Info("debug: Remove pod info: %v", pod)
	log.V(100).Info("debug: Node %v", cache.nodes)
	n, err := cache.GetNodeInfo(pod.Spec.NodeName)
	if err == nil {
		n.removePod(pod)
	} else {
		log.V(10).Info("debug: Failed to get node %s due to %v", pod.Spec.NodeName, err)
	}

	cache.forgetPod(pod.UID)
}

// GetNodeInfo gets the nodeInfo, or builds it if it doesn't exist yet.
func (cache *SchedulerCache) GetNodeInfo(name string) (*NodeInfo, error) {
	node, err := cache.nodeLister.Get(name)
	if err != nil {
		return nil, err
	}

	cache.nLock.Lock()
	defer cache.nLock.Unlock()
	n, ok := cache.nodes[name]

	if !ok {
		n = NewNodeInfo(node)
		cache.nodes[name] = n
	} else {
		// if the existing node turns from non gpushare to gpushare
		// if (utils.GetTotalGPUMemory(n.node) <= 0 && utils.GetTotalGPUMemory(node) > 0) ||
		// 	(utils.GetGPUCountInNode(n.node) <= 0 && utils.GetGPUCountInNode(node) > 0) ||
		// 	// if the existing node turns from gpushare to non gpushare
		// 	(utils.GetTotalGPUMemory(n.node) > 0 && utils.GetTotalGPUMemory(node) <= 0) ||
		// 	(utils.GetGPUCountInNode(n.node) > 0 && utils.GetGPUCountInNode(node) <= 0) {
		if len(cache.nodes[name].devs) == 0 ||
			utils.GetTotalGPUMemory(n.node) <= 0 ||
			utils.GetGPUCountInNode(n.node) <= 0 {
			log.V(10).Info("info: GetNodeInfo() needs to update node %s",
				name)

			// fix the scenario where the number of devices changes from 0 to a positive number
			cache.nodes[name].Reset(node)
			log.V(10).Info("info: node: %s, labels from cache after being updated: %v", n.node.Name, n.node.Labels)
		} else {
			log.V(10).Info("info: GetNodeInfo() uses the existing nodeInfo for %s", name)
		}
		log.V(100).Info("debug: node %s with devices %v", name, n.devs)
	}
	return n, nil
}

func (cache *SchedulerCache) forgetPod(uid types.UID) {
	cache.nLock.Lock()
	defer cache.nLock.Unlock()
	delete(cache.knownPods, uid)
}

func (cache *SchedulerCache) rememberPod(uid types.UID, pod *v1.Pod) {
	cache.nLock.Lock()
	defer cache.nLock.Unlock()
	cache.knownPods[pod.UID] = pod
}
gpushare-scheduler-extender/pkg/cache/configmap.go (vendored, new file, 33 lines)
@@ -0,0 +1,33 @@
package cache

import (
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	corelisters "k8s.io/client-go/listers/core/v1"
	clientgocache "k8s.io/client-go/tools/cache"

	"k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

var (
	ConfigMapLister         corelisters.ConfigMapLister
	ConfigMapInformerSynced clientgocache.InformerSynced
)

func getConfigMap(name string) *v1.ConfigMap {
	configMap, err := ConfigMapLister.ConfigMaps(metav1.NamespaceSystem).Get(name)

	// If we can't get the configmap just return nil. The resync will eventually
	// sync things up.
	if err != nil {
		if !apierrors.IsNotFound(err) {
			log.V(10).Info("warn: failed to get configmap due to error: %v", err)
			utilruntime.HandleError(err)
		}
		return nil
	}

	return configMap
}
gpushare-scheduler-extender/pkg/cache/deviceinfo.go (vendored, new file, 80 lines)
@@ -0,0 +1,80 @@
package cache

import (
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
	"sync"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
)

type DeviceInfo struct {
	idx    int
	podMap map[types.UID]*v1.Pod
	// usedGPUMem uint
	totalGPUMem uint
	rwmu        *sync.RWMutex
}

func (d *DeviceInfo) GetPods() []*v1.Pod {
	pods := []*v1.Pod{}
	for _, pod := range d.podMap {
		pods = append(pods, pod)
	}
	return pods
}

func newDeviceInfo(index int, totalGPUMem uint) *DeviceInfo {
	return &DeviceInfo{
		idx:         index,
		totalGPUMem: totalGPUMem,
		podMap:      map[types.UID]*v1.Pod{},
		rwmu:        new(sync.RWMutex),
	}
}

func (d *DeviceInfo) GetTotalGPUMemory() uint {
	return d.totalGPUMem
}

func (d *DeviceInfo) GetUsedGPUMemory() (gpuMem uint) {
	log.V(100).Info("debug: GetUsedGPUMemory() podMap %v, and its address is %p", d.podMap, d)
	d.rwmu.RLock()
	defer d.rwmu.RUnlock()
	for _, pod := range d.podMap {
		if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
			log.V(100).Info("debug: skip the pod %s in ns %s because its status is %s", pod.Name, pod.Namespace, pod.Status.Phase)
			continue
		}
		// gpuMem += utils.GetGPUMemoryFromPodEnv(pod)
		gpuMem += utils.GetGPUMemoryFromPodAnnotation(pod)
	}
	return gpuMem
}

func (d *DeviceInfo) addPod(pod *v1.Pod) {
	log.V(100).Info("debug: dev.addPod() Pod %s in ns %s with the GPU ID %d will be added to device map",
		pod.Name,
		pod.Namespace,
		d.idx)
	d.rwmu.Lock()
	defer d.rwmu.Unlock()
	d.podMap[pod.UID] = pod
	log.V(100).Info("debug: dev.addPod() podMap after update is %v, and its address is %p",
		d.podMap,
		d)
}

func (d *DeviceInfo) removePod(pod *v1.Pod) {
	log.V(100).Info("debug: dev.removePod() Pod %s in ns %s with the GPU ID %d will be removed from device map",
		pod.Name,
		pod.Namespace,
		d.idx)
	d.rwmu.Lock()
	defer d.rwmu.Unlock()
	delete(d.podMap, pod.UID)
	log.V(100).Info("debug: dev.removePod() podMap after update is %v, and its address is %p",
		d.podMap,
		d)
}
gpushare-scheduler-extender/pkg/cache/nodeinfo.go (vendored, new file, 362 lines)
@@ -0,0 +1,362 @@
package cache

import (
	"context"
	"fmt"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
	"strconv"
	"strings"
	"sync"

	v1 "k8s.io/api/core/v1"

	"k8s.io/apimachinery/pkg/types"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

const (
	OptimisticLockErrorMsg = "the object has been modified; please apply your changes to the latest version and try again"
)

// NodeInfo is node level aggregated information.
type NodeInfo struct {
	ctx            context.Context
	name           string
	node           *v1.Node
	devs           map[int]*DeviceInfo
	gpuCount       int
	gpuTotalMemory int
	rwmu           *sync.RWMutex
}

// NewNodeInfo creates the node level aggregated information for a node.
func NewNodeInfo(node *v1.Node) *NodeInfo {
	log.V(10).Info("debug: NewNodeInfo() creates nodeInfo for %s", node.Name)

	devMap := map[int]*DeviceInfo{}
	for i := 0; i < utils.GetGPUCountInNode(node); i++ {
		devMap[i] = newDeviceInfo(i, uint(utils.GetTotalGPUMemory(node)/utils.GetGPUCountInNode(node)))
	}

	if len(devMap) == 0 {
		log.V(3).Info("warn: node %s with nodeinfo %v has no devices", node.Name, node)
	}

	return &NodeInfo{
		ctx:            context.Background(),
		name:           node.Name,
		node:           node,
		devs:           devMap,
		gpuCount:       utils.GetGPUCountInNode(node),
		gpuTotalMemory: utils.GetTotalGPUMemory(node),
		rwmu:           new(sync.RWMutex),
	}
}

// Reset refreshes the node; the devices are only rebuilt when devs is empty.
func (n *NodeInfo) Reset(node *v1.Node) {
	n.gpuCount = utils.GetGPUCountInNode(node)
	n.gpuTotalMemory = utils.GetTotalGPUMemory(node)
	n.node = node
	if n.gpuCount == 0 {
		log.V(3).Info("warn: Reset for node %s but the gpu count is 0", node.Name)
	}

	if n.gpuTotalMemory == 0 {
		log.V(3).Info("warn: Reset for node %s but the gpu total memory is 0", node.Name)
	}

	if len(n.devs) == 0 && n.gpuCount > 0 {
		devMap := map[int]*DeviceInfo{}
		for i := 0; i < utils.GetGPUCountInNode(node); i++ {
			devMap[i] = newDeviceInfo(i, uint(n.gpuTotalMemory/n.gpuCount))
		}
		n.devs = devMap
	}
	log.V(3).Info("info: Reset() updates nodeInfo for %s with devs %v", node.Name, n.devs)
}

func (n *NodeInfo) GetName() string {
	return n.name
}

func (n *NodeInfo) GetDevs() []*DeviceInfo {
	devs := make([]*DeviceInfo, n.gpuCount)
	for i, dev := range n.devs {
		devs[i] = dev
	}
	return devs
}

func (n *NodeInfo) GetNode() *v1.Node {
	return n.node
}

func (n *NodeInfo) GetTotalGPUMemory() int {
	return n.gpuTotalMemory
}

func (n *NodeInfo) GetGPUCount() int {
	return n.gpuCount
}

func (n *NodeInfo) removePod(pod *v1.Pod) {
	n.rwmu.Lock()
	defer n.rwmu.Unlock()

	id := utils.GetGPUIDFromAnnotation(pod)
	if id >= 0 {
		dev, found := n.devs[id]
		if !found {
			log.V(3).Info("warn: Pod %s in ns %s failed to find the GPU ID %d in node %s", pod.Name, pod.Namespace, id, n.name)
		} else {
			dev.removePod(pod)
		}
	} else {
		log.V(3).Info("warn: Pod %s in ns %s does not have GPU ID %d set in node %s", pod.Name, pod.Namespace, id, n.name)
	}
}

// addOrUpdatePod adds the pod that carries a GPU ID annotation to the node.
func (n *NodeInfo) addOrUpdatePod(pod *v1.Pod) (added bool) {
	n.rwmu.Lock()
	defer n.rwmu.Unlock()

	id := utils.GetGPUIDFromAnnotation(pod)
	log.V(3).Info("debug: addOrUpdatePod() Pod %s in ns %s with the GPU ID %d should be added to device map",
		pod.Name,
		pod.Namespace,
		id)
	if id >= 0 {
		dev, found := n.devs[id]
		if !found {
			log.V(3).Info("warn: Pod %s in ns %s failed to find the GPU ID %d in node %s", pod.Name, pod.Namespace, id, n.name)
		} else {
			dev.addPod(pod)
			added = true
		}
	} else {
		log.V(3).Info("warn: Pod %s in ns %s does not have GPU ID %d set in node %s", pod.Name, pod.Namespace, id, n.name)
	}
	return added
}

// Assume checks whether the pod can be allocated on the node.
func (n *NodeInfo) Assume(pod *v1.Pod) (allocatable bool) {
	allocatable = false

	n.rwmu.RLock()
	defer n.rwmu.RUnlock()

	availableGPUs := n.getAvailableGPUs()
	reqGPU := uint(utils.GetGPUMemoryFromPodResource(pod))
	log.V(10).Info("debug: AvailableGPUs: %v in node %s", availableGPUs, n.name)

	if len(availableGPUs) > 0 {
		for devID := 0; devID < len(n.devs); devID++ {
			availableGPU, ok := availableGPUs[devID]
			if ok {
				if availableGPU >= reqGPU {
					allocatable = true
					break
				}
			}
		}
	}

	return allocatable
}

func (n *NodeInfo) Allocate(clientset *kubernetes.Clientset, pod *v1.Pod) (err error) {
	var newPod *v1.Pod
	n.rwmu.Lock()
	defer n.rwmu.Unlock()
	log.V(3).Info("info: Allocate() ----Begin to allocate GPU mem for pod %s in ns %s----", pod.Name, pod.Namespace)
	// 1. Update the pod spec
	devId, found := n.allocateGPUID(pod)
	if found {
		log.V(3).Info("info: Allocate() 1. Allocate GPU ID %d to pod %s in ns %s.----", devId, pod.Name, pod.Namespace)
		// newPod := utils.GetUpdatedPodEnvSpec(pod, devId, nodeInfo.GetTotalGPUMemory()/nodeInfo.GetGPUCount())
		// newPod = utils.GetUpdatedPodAnnotationSpec(pod, devId, n.GetTotalGPUMemory()/n.GetGPUCount())
		patchedAnnotationBytes, err := utils.PatchPodAnnotationSpec(pod, devId, n.GetTotalGPUMemory()/n.GetGPUCount())
		if err != nil {
			return fmt.Errorf("failed to generate patched annotations, reason: %v", err)
		}
		newPod, err = clientset.CoreV1().Pods(pod.Namespace).Patch(n.ctx, pod.Name, types.StrategicMergePatchType, patchedAnnotationBytes, metav1.PatchOptions{})
		// _, err = clientset.CoreV1().Pods(newPod.Namespace).Update(newPod)
		if err != nil {
			// the object has been modified; please apply your changes to the latest version and try again
			if err.Error() == OptimisticLockErrorMsg {
				// retry
				pod, err = clientset.CoreV1().Pods(pod.Namespace).Get(n.ctx, pod.Name, metav1.GetOptions{})
				if err != nil {
					return err
				}
				// newPod = utils.GetUpdatedPodEnvSpec(pod, devId, nodeInfo.GetTotalGPUMemory()/nodeInfo.GetGPUCount())
				// newPod = utils.GetUpdatedPodAnnotationSpec(pod, devId, n.GetTotalGPUMemory()/n.GetGPUCount())
				// _, err = clientset.CoreV1().Pods(newPod.Namespace).Update(newPod)
				newPod, err = clientset.CoreV1().Pods(pod.Namespace).Patch(n.ctx, pod.Name, types.StrategicMergePatchType, patchedAnnotationBytes, metav1.PatchOptions{})
				if err != nil {
					return err
				}
			} else {
				log.V(3).Info("failed to patch pod %v", pod)
				return err
			}
		}
	} else {
		err = fmt.Errorf("The node %s can't place the pod %s in ns %s, and the pod spec is %v", pod.Spec.NodeName, pod.Name, pod.Namespace, pod)
	}

	// 2. Bind the pod to the node
	if err == nil {
		binding := &v1.Binding{
			ObjectMeta: metav1.ObjectMeta{Name: pod.Name, UID: pod.UID},
			Target:     v1.ObjectReference{Kind: "Node", Name: n.name},
		}
		log.V(3).Info("info: Allocate() 2. Try to bind pod %s in %s namespace to node %s with %v",
			pod.Name,
			pod.Namespace,
			pod.Spec.NodeName,
			binding)
		err = clientset.CoreV1().Pods(pod.Namespace).Bind(n.ctx, binding, metav1.CreateOptions{})
		if err != nil {
			log.V(3).Info("warn: Failed to bind the pod %s in ns %s due to %v", pod.Name, pod.Namespace, err)
			return err
		}
	}

	// 3. update the device info if the pod is updated successfully
	if err == nil {
		log.V(3).Info("info: Allocate() 3. Try to add pod %s in ns %s to dev %d",
			pod.Name,
			pod.Namespace,
			devId)
		dev, found := n.devs[devId]
		if !found {
			log.V(3).Info("warn: Pod %s in ns %s failed to find the GPU ID %d in node %s", pod.Name, pod.Namespace, devId, n.name)
		} else {
			dev.addPod(newPod)
		}
	}
	log.V(3).Info("info: Allocate() ----End of allocating GPU mem for pod %s in ns %s----", pod.Name, pod.Namespace)
	return err
}

// allocateGPUID selects the GPU device for the pod; it is a best-fit choice:
// the device with the smallest available memory that still satisfies the request.
func (n *NodeInfo) allocateGPUID(pod *v1.Pod) (candidateDevID int, found bool) {
	reqGPU := uint(0)
	found = false
	candidateDevID = -1
	candidateGPUMemory := uint(0)
	availableGPUs := n.getAvailableGPUs()

	reqGPU = uint(utils.GetGPUMemoryFromPodResource(pod))

	if reqGPU > uint(0) {
		log.V(3).Info("info: reqGPU for pod %s in ns %s: %d", pod.Name, pod.Namespace, reqGPU)
		log.V(3).Info("info: AvailableGPUs: %v in node %s", availableGPUs, n.name)
		if len(availableGPUs) > 0 {
			for devID := 0; devID < len(n.devs); devID++ {
				availableGPU, ok := availableGPUs[devID]
				if ok {
					if availableGPU >= reqGPU {
						if candidateDevID == -1 || candidateGPUMemory > availableGPU {
							candidateDevID = devID
							candidateGPUMemory = availableGPU
						}

						found = true
					}
				}
			}
		}

		if found {
			log.V(3).Info("info: Found candidate dev id %d for pod %s in ns %s successfully.",
				candidateDevID,
				pod.Name,
				pod.Namespace)
		} else {
			log.V(3).Info("warn: Failed to find a GPU with %d available memory for the pod %s in the namespace %s",
				reqGPU,
				pod.Name,
				pod.Namespace)
		}
	}

	return candidateDevID, found
}

func (n *NodeInfo) getAvailableGPUs() (availableGPUs map[int]uint) {
	allGPUs := n.getAllGPUs()
	usedGPUs := n.getUsedGPUs()
	unhealthyGPUs := n.getUnhealthyGPUs()
	availableGPUs = map[int]uint{}
	for id, totalGPUMem := range allGPUs {
		if usedGPUMem, found := usedGPUs[id]; found {
			availableGPUs[id] = totalGPUMem - usedGPUMem
		}
	}
	log.V(3).Info("info: available GPU list %v before removing unhealthy GPUs", availableGPUs)
	for id := range unhealthyGPUs {
		log.V(3).Info("info: delete dev %d from available GPU list", id)
		delete(availableGPUs, id)
	}
	log.V(3).Info("info: available GPU list %v after removing unhealthy GPUs", availableGPUs)

	return availableGPUs
}

// getUsedGPUs maps device index to used GPU memory.
func (n *NodeInfo) getUsedGPUs() (usedGPUs map[int]uint) {
	usedGPUs = map[int]uint{}
	for _, dev := range n.devs {
		usedGPUs[dev.idx] = dev.GetUsedGPUMemory()
	}
	log.V(3).Info("info: getUsedGPUs: %v in node %s, and devs %v", usedGPUs, n.name, n.devs)
	return usedGPUs
}

// getAllGPUs maps device index to total GPU memory.
func (n *NodeInfo) getAllGPUs() (allGPUs map[int]uint) {
	allGPUs = map[int]uint{}
	for _, dev := range n.devs {
		allGPUs[dev.idx] = dev.totalGPUMem
	}
	log.V(3).Info("info: getAllGPUs: %v in node %s, and devs %v", allGPUs, n.name, n.devs)
	return allGPUs
}

// getUnhealthyGPUs gets the unhealthy GPUs from the configmap.
func (n *NodeInfo) getUnhealthyGPUs() (unhealthyGPUs map[int]bool) {
	unhealthyGPUs = map[int]bool{}
	name := fmt.Sprintf("unhealthy-gpu-%s", n.GetName())
	log.V(3).Info("info: try to find unhealthy GPU configmap %s", name)
	cm := getConfigMap(name)
	if cm == nil {
		return
	}

	if devicesStr, found := cm.Data["gpus"]; found {
		log.V(3).Info("warn: the unhealthy gpus %s", devicesStr)
		idsStr := strings.Split(devicesStr, ",")
		for _, sid := range idsStr {
			id, err := strconv.Atoi(sid)
			if err != nil {
				log.V(3).Info("warn: failed to parse id %s due to %v", sid, err)
			}
			unhealthyGPUs[id] = true
		}
	} else {
		log.V(3).Info("info: skip, because there are no unhealthy gpus")
	}

	return
}
gpushare-scheduler-extender/pkg/gpushare/controller.go (new file, 346 lines)
@@ -0,0 +1,346 @@
package gpushare

import (
	"fmt"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
	"golang.org/x/time/rate"
	"time"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	kubeinformers "k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	clientgocache "k8s.io/client-go/tools/cache"
	"k8s.io/client-go/util/workqueue"

	typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/record"
)

var (
	KeyFunc = clientgocache.DeletionHandlingMetaNamespaceKeyFunc
)

type Controller struct {
	clientset *kubernetes.Clientset

	// podLister can list/get pods from the shared informer's store.
	podLister corelisters.PodLister

	// nodeLister can list/get nodes from the shared informer's store.
	nodeLister corelisters.NodeLister

	// podQueue is a rate limited work queue. This is used to queue work to be
	// processed instead of performing it as soon as a change happens. This
	// means we can ensure we only process a fixed amount of resources at a
	// time, and makes it easy to ensure we are never processing the same item
	// simultaneously in two different workers.
	podQueue workqueue.RateLimitingInterface

	// recorder is an event recorder for recording Event resources to the
	// Kubernetes API.
	recorder record.EventRecorder

	// podInformerSynced returns true if the pod store has been synced at least once.
	podInformerSynced clientgocache.InformerSynced

	// nodeInformerSynced returns true if the node store has been synced at least once.
	nodeInformerSynced clientgocache.InformerSynced

	schedulerCache *cache.SchedulerCache

	// removePodCache stores the pods that are about to be removed.
	removePodCache map[string]*v1.Pod
}

func NewController(clientset *kubernetes.Clientset, kubeInformerFactory kubeinformers.SharedInformerFactory, stopCh <-chan struct{}) (*Controller, error) {
	log.V(100).Info("info: Creating event broadcaster")
	eventBroadcaster := record.NewBroadcaster()
	// eventBroadcaster.StartLogging(log.Infof)
	eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: clientset.CoreV1().Events("")})
	recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "gpushare-schd-extender"})

	rateLimiter := workqueue.NewMaxOfRateLimiter(
		workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 1000*time.Second),
		&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(100), 500)},
	)
	c := &Controller{
		clientset:      clientset,
		podQueue:       workqueue.NewNamedRateLimitingQueue(rateLimiter, "podQueue"),
		recorder:       recorder,
		removePodCache: map[string]*v1.Pod{},
	}
	// Create pod informer.
	podInformer := kubeInformerFactory.Core().V1().Pods()
	podInformer.Informer().AddEventHandler(clientgocache.FilteringResourceEventHandler{
		FilterFunc: func(obj interface{}) bool {
			switch t := obj.(type) {
			case *v1.Pod:
				// log.V(100).Info("debug: added pod %s in ns %s", t.Name, t.Namespace)
				return utils.IsGPUsharingPod(t)
			case clientgocache.DeletedFinalStateUnknown:
				if pod, ok := t.Obj.(*v1.Pod); ok {
					log.V(100).Info("debug: delete pod %s in ns %s", pod.Name, pod.Namespace)
					return utils.IsGPUsharingPod(pod)
				}
				runtime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod in %T", obj, c))
				return false
			default:
				runtime.HandleError(fmt.Errorf("unable to handle object in %T: %T", c, obj))
				return false
			}
		},
		Handler: clientgocache.ResourceEventHandlerFuncs{
			AddFunc:    c.addPodToCache,
			UpdateFunc: c.updatePodInCache,
			DeleteFunc: c.deletePodFromCache,
		},
	})

	c.podLister = podInformer.Lister()
	c.podInformerSynced = podInformer.Informer().HasSynced

	// Create node informer
	nodeInformer := kubeInformerFactory.Core().V1().Nodes()
	c.nodeLister = nodeInformer.Lister()
	c.nodeInformerSynced = nodeInformer.Informer().HasSynced

	// Create configMap informer
	cmInformer := kubeInformerFactory.Core().V1().ConfigMaps()
	cache.ConfigMapLister = cmInformer.Lister()
	cache.ConfigMapInformerSynced = cmInformer.Informer().HasSynced

	// Start informer goroutines.
	go kubeInformerFactory.Start(stopCh)

	// Create scheduler Cache
	c.schedulerCache = cache.NewSchedulerCache(c.nodeLister, c.podLister)

	log.V(100).Info("info: begin waiting for caches to sync")

	if ok := clientgocache.WaitForCacheSync(stopCh, c.nodeInformerSynced); !ok {
		return nil, fmt.Errorf("failed to wait for node caches to sync")
	} else {
		log.V(100).Info("info: init the node cache successfully")
	}

	if ok := clientgocache.WaitForCacheSync(stopCh, c.podInformerSynced); !ok {
		return nil, fmt.Errorf("failed to wait for pod caches to sync")
	} else {
		log.V(100).Info("info: init the pod cache successfully")
	}

	if ok := clientgocache.WaitForCacheSync(stopCh, cache.ConfigMapInformerSynced); !ok {
		return nil, fmt.Errorf("failed to wait for configmap caches to sync")
	} else {
		log.V(100).Info("info: init the configmap cache successfully")
	}

	log.V(100).Info("info: done waiting for caches to sync")

	return c, nil
}

func (c *Controller) BuildCache() error {
	return c.schedulerCache.BuildCache()
}

func (c *Controller) GetSchedulerCache() *cache.SchedulerCache {
	return c.schedulerCache
}

// Run will set up the event handlers
func (c *Controller) Run(threadiness int, stopCh <-chan struct{}) error {
	defer runtime.HandleCrash()
	defer c.podQueue.ShutDown()

	log.V(9).Info("info: Starting GPU Sharing Controller.")
	log.V(9).Info("info: Waiting for informer caches to sync")

	log.V(9).Info("info: Starting %v workers.", threadiness)
	for i := 0; i < threadiness; i++ {
		go wait.Until(c.runWorker, time.Second, stopCh)
	}

	log.V(3).Info("info: Started workers")
	<-stopCh
	log.V(3).Info("info: Shutting down workers")

	return nil
}

// runWorker is a long-running function that will continually call the
// processNextWorkItem function in order to read and process a message on the
// workqueue.
func (c *Controller) runWorker() {
	for c.processNextWorkItem() {
	}
}

// syncPod will sync the pod with the given key if it has had its expectations fulfilled,
// meaning it did not expect to see any more of its pods created or deleted. This function is not meant to be
// invoked concurrently with the same key.
func (c *Controller) syncPod(key string) (forget bool, err error) {
	ns, name, err := clientgocache.SplitMetaNamespaceKey(key)
	log.V(9).Info("debug: begin to sync gpushare pod %s in ns %s", name, ns)
	if err != nil {
		return false, err
	}

	pod, err := c.podLister.Pods(ns).Get(name)
	switch {
	case errors.IsNotFound(err):
		log.V(10).Info("debug: pod %s in ns %s has been deleted.", name, ns)
		pod, found := c.removePodCache[key]
		if found {
			c.schedulerCache.RemovePod(pod)
			delete(c.removePodCache, key)
		}
	case err != nil:
		log.V(10).Info("warn: unable to retrieve pod %v from the store: %v", key, err)
	default:
		if utils.IsCompletePod(pod) {
			log.V(10).Info("debug: pod %s in ns %s has completed.", name, ns)
			c.schedulerCache.RemovePod(pod)
		} else {
			err := c.schedulerCache.AddOrUpdatePod(pod)
			if err != nil {
				return false, err
			}
		}
	}

	return true, nil
}

// processNextWorkItem will read a single work item off the podQueue and
// attempt to process it.
func (c *Controller) processNextWorkItem() bool {
	log.V(100).Info("debug: begin processNextWorkItem()")
	key, quit := c.podQueue.Get()
	if quit {
		return false
	}
	defer c.podQueue.Done(key)
	defer log.V(100).Info("debug: end processNextWorkItem()")
	forget, err := c.syncPod(key.(string))
	if err == nil {
		if forget {
			c.podQueue.Forget(key)
		}
		return true
	}

	log.V(3).Info("Error syncing pods: %v", err)
	runtime.HandleError(fmt.Errorf("Error syncing pod: %v", err))
	c.podQueue.AddRateLimited(key)

	return true
}

func (c *Controller) addPodToCache(obj interface{}) {
	pod, ok := obj.(*v1.Pod)
	if !ok {
		log.V(3).Info("warn: cannot convert to *v1.Pod: %v", obj)
		return
	}

	// if !assignedNonTerminatedPod(t) {
	// 	log.V(100).Info("debug: skip pod %s because it's terminated.", pod.Name)
	// 	return
	// }

	podKey, err := KeyFunc(pod)
	if err != nil {
		log.V(3).Info("warn: Failed to get the pod key: %v", err)
		return
	}

	c.podQueue.Add(podKey)

	// NOTE: Updating equivalence cache of addPodToCache has been
	// handled optimistically in: pkg/scheduler/scheduler.go#assume()
}

func (c *Controller) updatePodInCache(oldObj, newObj interface{}) {
	oldPod, ok := oldObj.(*v1.Pod)
	if !ok {
		log.V(3).Info("warn: cannot convert oldObj to *v1.Pod: %v", oldObj)
		return
	}
	newPod, ok := newObj.(*v1.Pod)
	if !ok {
		log.V(3).Info("warn: cannot convert newObj to *v1.Pod: %v", newObj)
		return
	}
	needUpdate := false

	podUID := oldPod.UID

	// 1. Need an update when a known pod has turned complete or failed
	if c.schedulerCache.KnownPod(podUID) && utils.IsCompletePod(newPod) {
		needUpdate = true
	}
	// 2. Need an update when the pod is unknown and its GPU annotation has been set
	if !c.schedulerCache.KnownPod(podUID) && utils.GetGPUIDFromAnnotation(newPod) >= 0 {
		needUpdate = true
	}
	if needUpdate {
		podKey, err := KeyFunc(newPod)
		if err != nil {
			log.V(3).Info("warn: Failed to get the pod key: %v", err)
			return
		}
		log.V(3).Info("info: Need to update pod name %s in ns %s and old status is %v, new status is %v; its old annotation %v and new annotation %v",
			newPod.Name,
			newPod.Namespace,
			oldPod.Status.Phase,
			newPod.Status.Phase,
			oldPod.Annotations,
			newPod.Annotations)
		c.podQueue.Add(podKey)
	} else {
		log.V(100).Info("debug: No need to update pod name %s in ns %s and old status is %v, new status is %v; its old annotation %v and new annotation %v",
			newPod.Name,
			newPod.Namespace,
			oldPod.Status.Phase,
			newPod.Status.Phase,
			oldPod.Annotations,
			newPod.Annotations)
	}

	return
}

func (c *Controller) deletePodFromCache(obj interface{}) {
	var pod *v1.Pod
	switch t := obj.(type) {
	case *v1.Pod:
		pod = t
	case clientgocache.DeletedFinalStateUnknown:
		var ok bool
		pod, ok = t.Obj.(*v1.Pod)
		if !ok {
			log.V(3).Info("warn: cannot convert to *v1.Pod: %v", t.Obj)
			return
		}
	default:
		log.V(3).Info("warn: cannot convert to *v1.Pod: %v", t)
		return
	}

	log.V(100).Info("debug: delete pod %s in ns %s", pod.Name, pod.Namespace)
	podKey, err := KeyFunc(pod)
	if err != nil {
		log.V(3).Info("warn: Failed to get the pod key: %v", err)
		return
	}
	c.podQueue.Add(podKey)
	c.removePodCache[podKey] = pod
}
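As a hedged illustration (not part of this commit), the sketch below shows one way the controller above might be wired up in a main package. The verbosity level 10, the 30-second resync period, the worker count of 2, and the reliance on in-cluster client configuration are all assumptions, not values taken from this repository.

package main

import (
	"time"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/gpushare"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
	kubeinformers "k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// The controller logs through pkg/log, so initialize the logger first.
	log.NewLoggerWithLevel(10)

	// Empty paths make clientcmd fall back to in-cluster configuration.
	config, err := clientcmd.BuildConfigFromFlags("", "")
	if err != nil {
		log.Fatal("failed to build client config: %v", err)
	}
	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		log.Fatal("failed to create clientset: %v", err)
	}

	stopCh := make(chan struct{})
	// The shared informer factory backs the pod/node/configmap listers used by the controller.
	factory := kubeinformers.NewSharedInformerFactory(clientset, 30*time.Second)
	c, err := gpushare.NewController(clientset, factory, stopCh)
	if err != nil {
		log.Fatal("failed to create controller: %v", err)
	}
	// Seed the scheduler cache from pods that already carry GPU annotations,
	// then start the sync workers.
	if err := c.BuildCache(); err != nil {
		log.Fatal("failed to build scheduler cache: %v", err)
	}
	_ = c.Run(2, stopCh)
}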
gpushare-scheduler-extender/pkg/log/level.go (new file, 70 lines)
@@ -0,0 +1,70 @@
package log

import (
	"fmt"
	"os"
	"sync"

	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
)

type levelLogger struct {
	level *int32
	mu    sync.Mutex
	log   *zap.Logger
}

type verbose bool

var l *levelLogger

func NewLoggerWithLevel(level int32, option ...zap.Option) {
	cfg := zap.NewProductionEncoderConfig()
	cfg.EncodeTime = zapcore.ISO8601TimeEncoder

	core := zapcore.NewCore(
		zapcore.NewJSONEncoder(cfg),
		zapcore.Lock(os.Stdout),
		zap.NewAtomicLevel(),
	)

	if option == nil {
		option = []zap.Option{}
	}
	option = append(option, zap.AddCaller(), zap.AddCallerSkip(1))
	l = &levelLogger{
		level: &level,
		mu:    sync.Mutex{},
		log:   zap.New(core, option...),
	}
}

/*
V selects messages by verbosity level: a message is emitted only when its level
is below the configured logger level (e.g. global default level 3, debug level 10).
Example levels:

	api request 4
	api response 9

	services func 5

	db error 9
	db query 11
	db result 15
*/
func V(level int32) verbose {
	return level < *l.level
}

func (v verbose) Info(format string, args ...interface{}) {
	if v {
		l.log.Info(fmt.Sprintf(format, args...))
	}
}

func Fatal(format string, args ...interface{}) {
	l.log.Fatal(fmt.Sprintf(format, args...))
}
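A hedged usage sketch of the leveled logger above (not part of this commit): a message prints only when its verbosity is strictly below the configured level, so with level 10 the V(3) line is emitted and the V(100) line is suppressed. The level value and the message texts are illustrative.

package main

import "github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"

func main() {
	// Initialize the package-level zap logger; 10 is an assumed demo level.
	log.NewLoggerWithLevel(10)

	// Emitted, because 3 < 10.
	log.V(3).Info("info: scheduler extender starting with %d workers", 2)
	// Suppressed, because 100 >= 10.
	log.V(100).Info("debug: verbose details only visible at very high levels")
}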
gpushare-scheduler-extender/pkg/routes/pprof.go (new file, 64 lines)
@@ -0,0 +1,64 @@
package routes

import (
	"net/http"
	"net/http/pprof"

	"github.com/julienschmidt/httprouter"
)

func AddPProf(r *httprouter.Router) {
	r.GET("/debug/pprof/", index)
	r.GET("/debug/pprof/cmdline/", cmdline)
	r.GET("/debug/pprof/profile/", profile)
	r.GET("/debug/pprof/symbol/", symbol)
	r.GET("/debug/pprof/trace/", trace)

	r.GET("/debug/pprof/heap/", heap)
	r.GET("/debug/pprof/goroutine/", goroutine)
	r.GET("/debug/pprof/block/", block)
	r.GET("/debug/pprof/threadcreate/", threadcreate)
	r.GET("/debug/pprof/mutex/", mutex)
}

// profiling tools handlers

func index(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Index(w, r)
}

func cmdline(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Cmdline(w, r)
}

func profile(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Profile(w, r)
}

func symbol(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Symbol(w, r)
}

func trace(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Trace(w, r)
}

func heap(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Handler("heap").ServeHTTP(w, r)
}

func goroutine(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Handler("goroutine").ServeHTTP(w, r)
}

func block(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Handler("block").ServeHTTP(w, r)
}

func threadcreate(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Handler("threadcreate").ServeHTTP(w, r)
}

func mutex(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	pprof.Handler("mutex").ServeHTTP(w, r)
}
gpushare-scheduler-extender/pkg/routes/routes.go (new file, 181 lines)
@@ -0,0 +1,181 @@
package routes

import (
	"bytes"
	"encoding/json"
	"fmt"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
	"io"
	"net/http"
	"time"

	"github.com/julienschmidt/httprouter"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/scheduler"

	schedulerapi "k8s.io/kube-scheduler/extender/v1"
)

const (
	versionPath       = "/version"
	apiPrefix         = "/gpushare-scheduler"
	bindPrefix        = apiPrefix + "/bind"
	predicatesPrefix  = apiPrefix + "/filter"
	inspectPrefix     = apiPrefix + "/inspect/:nodename"
	inspectListPrefix = apiPrefix + "/inspect"
)

var (
	version = "0.1.0"
	// mu sync.RWMutex
)

func checkBody(w http.ResponseWriter, r *http.Request) {
	if r.Body == nil {
		http.Error(w, "Please send a request body", 400)
		return
	}
}

func InspectRoute(inspect *scheduler.Inspect) httprouter.Handle {
	return func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
		result := inspect.Handler(ps.ByName("nodename"))

		if resultBody, err := json.Marshal(result); err != nil {
			// panic(err)
			log.V(3).Info("warn: Failed due to %v", err)
			w.Header().Set("Content-Type", "application/json")
			w.WriteHeader(http.StatusInternalServerError)
			errMsg := fmt.Sprintf("{'error':'%s'}", err.Error())
			w.Write([]byte(errMsg))
		} else {
			w.Header().Set("Content-Type", "application/json")
			w.WriteHeader(http.StatusOK)
			w.Write(resultBody)
		}
	}
}

func PredicateRoute(predicate *scheduler.Predicate) httprouter.Handle {
	return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
		checkBody(w, r)

		// mu.RLock()
		// defer mu.RUnlock()

		var buf bytes.Buffer
		body := io.TeeReader(r.Body, &buf)

		var extenderArgs schedulerapi.ExtenderArgs
		var extenderFilterResult *schedulerapi.ExtenderFilterResult

		if err := json.NewDecoder(body).Decode(&extenderArgs); err != nil {
			log.V(3).Info("warn: failed to parse request due to error %v", err)
			extenderFilterResult = &schedulerapi.ExtenderFilterResult{
				Nodes:       nil,
				FailedNodes: nil,
				Error:       err.Error(),
			}
		} else {
			log.V(90).Info("debug: gpusharingfilter ExtenderArgs = %v", extenderArgs)
			extenderFilterResult = predicate.Handler(&extenderArgs)
		}

		if resultBody, err := json.Marshal(extenderFilterResult); err != nil {
			// panic(err)
			log.V(3).Info("warn: Failed due to %v", err)
			w.Header().Set("Content-Type", "application/json")
			w.WriteHeader(http.StatusInternalServerError)
			errMsg := fmt.Sprintf("{'error':'%s'}", err.Error())
			w.Write([]byte(errMsg))
		} else {
			log.V(100).Info("predicate: %s, extenderFilterResult = %s ", predicate.Name, resultBody)
			w.Header().Set("Content-Type", "application/json")
			w.WriteHeader(http.StatusOK)
			w.Write(resultBody)
		}
	}
}

func BindRoute(bind *scheduler.Bind) httprouter.Handle {
	return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
		checkBody(w, r)

		// mu.Lock()
		// defer mu.Unlock()
		var buf bytes.Buffer
		body := io.TeeReader(r.Body, &buf)

		var extenderBindingArgs schedulerapi.ExtenderBindingArgs
		var extenderBindingResult *schedulerapi.ExtenderBindingResult
		failed := false

		if err := json.NewDecoder(body).Decode(&extenderBindingArgs); err != nil {
			extenderBindingResult = &schedulerapi.ExtenderBindingResult{
				Error: err.Error(),
			}
			failed = true
		} else {
			log.V(10).Info("debug: gpusharingBind ExtenderArgs = %v", extenderBindingArgs)
			extenderBindingResult = bind.Handler(extenderBindingArgs)
		}

		if len(extenderBindingResult.Error) > 0 {
			failed = true
		}

		if resultBody, err := json.Marshal(extenderBindingResult); err != nil {
			log.V(3).Info("warn: Failed due to %v", err)
			// panic(err)
			w.Header().Set("Content-Type", "application/json")
			w.WriteHeader(http.StatusInternalServerError)
			errMsg := fmt.Sprintf("{'error':'%s'}", err.Error())
			w.Write([]byte(errMsg))
		} else {
			log.V(3).Info("info: extenderBindingResult = %s", resultBody)
			w.Header().Set("Content-Type", "application/json")
			if failed {
				w.WriteHeader(http.StatusInternalServerError)
			} else {
				w.WriteHeader(http.StatusOK)
			}

			w.Write(resultBody)
		}
	}
}

func VersionRoute(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
	fmt.Fprint(w, fmt.Sprint(version))
}

func AddVersion(router *httprouter.Router) {
	router.GET(versionPath, DebugLogging(VersionRoute, versionPath))
}

func DebugLogging(h httprouter.Handle, path string) httprouter.Handle {
	return func(w http.ResponseWriter, r *http.Request, p httprouter.Params) {
		log.V(90).Info("path: %s, request body = %s", path, r.Body)
		startTime := time.Now()
		h(w, r, p)
		log.V(90).Info("path: %s, response: %v, cost_time: %v", path, w, time.Now().Sub(startTime))
	}
}

func AddPredicate(router *httprouter.Router, predicate *scheduler.Predicate) {
	// path := predicatesPrefix + "/" + predicate.Name
	router.POST(predicatesPrefix, DebugLogging(PredicateRoute(predicate), predicatesPrefix))
}

func AddBind(router *httprouter.Router, bind *scheduler.Bind) {
	if handle, _, _ := router.Lookup("POST", bindPrefix); handle != nil {
		log.V(3).Info("warning: AddBind was called more than once!")
	} else {
		router.POST(bindPrefix, DebugLogging(BindRoute(bind), bindPrefix))
	}
}

func AddInspect(router *httprouter.Router, inspect *scheduler.Inspect) {
	router.GET(inspectPrefix, DebugLogging(InspectRoute(inspect), inspectPrefix))
	router.GET(inspectListPrefix, DebugLogging(InspectRoute(inspect), inspectListPrefix))
}
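As a hedged illustration (not part of this commit), the sketch below shows how the route helpers above might be assembled into an extender HTTP server. The package name extenderserver, the helper name serveExtender, and the listen address are assumptions; the predicate, bind, and inspect values are expected to come from NewGPUsharePredicate, NewGPUShareBind, and NewGPUShareInspect defined elsewhere in pkg/scheduler.

package extenderserver

import (
	"net/http"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/routes"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/scheduler"
	"github.com/julienschmidt/httprouter"
)

// serveExtender registers the extender endpoints and blocks serving HTTP.
func serveExtender(addr string, predicate *scheduler.Predicate, bind *scheduler.Bind, inspect *scheduler.Inspect) error {
	router := httprouter.New()
	routes.AddPProf(router)                // /debug/pprof/* profiling handlers
	routes.AddVersion(router)              // GET /version
	routes.AddPredicate(router, predicate) // POST /gpushare-scheduler/filter
	routes.AddBind(router, bind)           // POST /gpushare-scheduler/bind
	routes.AddInspect(router, inspect)     // GET /gpushare-scheduler/inspect and /inspect/:nodename
	return http.ListenAndServe(addr, router)
}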
gpushare-scheduler-extender/pkg/scheduler/bind.go (new file, 26 lines)
@@ -0,0 +1,26 @@
package scheduler

import (
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
	"k8s.io/apimachinery/pkg/types"
	schedulerapi "k8s.io/kube-scheduler/extender/v1"
)

// Bind is responsible for binding node and pod
type Bind struct {
	Name  string
	Func  func(podName string, podNamespace string, podUID types.UID, node string, cache *cache.SchedulerCache) error
	cache *cache.SchedulerCache
}

// Handler handles the Bind request
func (b Bind) Handler(args schedulerapi.ExtenderBindingArgs) *schedulerapi.ExtenderBindingResult {
	err := b.Func(args.PodName, args.PodNamespace, args.PodUID, args.Node, b.cache)
	errMsg := ""
	if err != nil {
		errMsg = err.Error()
	}
	return &schedulerapi.ExtenderBindingResult{
		Error: errMsg,
	}
}
gpushare-scheduler-extender/pkg/scheduler/gpushare-bind.go (new file, 71 lines)
@@ -0,0 +1,71 @@
package scheduler

import (
	"context"
	"fmt"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
)

const (
	OptimisticLockErrorMsg = "the object has been modified; please apply your changes to the latest version and try again"
)

func NewGPUShareBind(ctx context.Context, clientset *kubernetes.Clientset, c *cache.SchedulerCache) *Bind {
	return &Bind{
		Name: "gpusharingbinding",
		Func: func(name string, namespace string, podUID types.UID, node string, c *cache.SchedulerCache) error {
			pod, err := getPod(ctx, name, namespace, podUID, clientset, c)
			if err != nil {
				log.V(9).Info("warn: Failed to handle pod %s in ns %s due to error %v", name, namespace, err)
				return err
			}

			nodeInfo, err := c.GetNodeInfo(node)
			if err != nil {
				log.V(9).Info("warn: Failed to handle pod %s in ns %s due to error %v", name, namespace, err)
				return err
			}
			err = nodeInfo.Allocate(clientset, pod)
			if err != nil {
				log.V(9).Info("warn: Failed to handle pod %s in ns %s due to error %v", name, namespace, err)
				return err
			}
			return nil
		},
		cache: c,
	}
}

func getPod(ctx context.Context, name string, namespace string, podUID types.UID, clientset *kubernetes.Clientset, c *cache.SchedulerCache) (pod *v1.Pod, err error) {
	pod, err = c.GetPod(name, namespace)
	if errors.IsNotFound(err) {
		pod, err = clientset.CoreV1().Pods(namespace).Get(ctx, name, metav1.GetOptions{})
		if err != nil {
			return nil, err
		}
	} else if err != nil {
		return nil, err
	}
	if pod.UID != podUID {
		pod, err = clientset.CoreV1().Pods(namespace).Get(ctx, name, metav1.GetOptions{})
		if err != nil {
			return nil, err
		}
		if pod.UID != podUID {
			return nil, fmt.Errorf("The pod %s in ns %s has uid %v, which does not match the expected %v",
				name,
				namespace,
				pod.UID,
				podUID)
		}
	}

	return pod, nil
}
@@ -0,0 +1,42 @@
package scheduler

import (
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
)

func NewGPUShareInspect(c *cache.SchedulerCache) *Inspect {
	return &Inspect{
		Name:  "gpushareinspect",
		cache: c,
	}
}

type Result struct {
	Nodes []*Node `json:"nodes"`
	Error string  `json:"error,omitempty"`
}

type Node struct {
	Name     string    `json:"name"`
	TotalGPU uint      `json:"totalGPU"`
	UsedGPU  uint      `json:"usedGPU"`
	Devices  []*Device `json:"devs"`
}

type Device struct {
	ID       int    `json:"id"`
	TotalGPU uint   `json:"totalGPU"`
	UsedGPU  uint   `json:"usedGPU"`
	Pods     []*Pod `json:"pods"`
}

type Pod struct {
	Name      string `json:"name"`
	Namespace string `json:"namespace"`
	UsedGPU   int    `json:"usedGPU"`
}

type Inspect struct {
	Name  string
	cache *cache.SchedulerCache
}
@@ -0,0 +1,10 @@
package scheduler

import (
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
	"k8s.io/client-go/kubernetes"
)

func NewGPUsharePredicate(clientset *kubernetes.Clientset, c *cache.SchedulerCache) *Predicate {
	return &Predicate{Name: "gpusharingfilter", cache: c}
}
gpushare-scheduler-extender/pkg/scheduler/inspect.go (new file, 69 lines)
@@ -0,0 +1,69 @@
package scheduler

import (
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
)

func (in Inspect) Handler(name string) *Result {
	nodes := []*Node{}
	errMsg := ""
	if len(name) == 0 {
		nodeInfos := in.cache.GetNodeinfos()
		for _, info := range nodeInfos {
			nodes = append(nodes, buildNode(info))
		}
	} else {
		node, err := in.cache.GetNodeInfo(name)
		if err != nil {
			errMsg = err.Error()
		} else {
			// only build the node when the lookup succeeded; otherwise node is nil
			nodes = append(nodes, buildNode(node))
		}
	}

	return &Result{
		Nodes: nodes,
		Error: errMsg,
	}
}

func buildNode(info *cache.NodeInfo) *Node {
	devInfos := info.GetDevs()
	devs := []*Device{}
	var usedGPU uint

	for i, devInfo := range devInfos {
		dev := &Device{
			ID:       i,
			TotalGPU: devInfo.GetTotalGPUMemory(),
			UsedGPU:  devInfo.GetUsedGPUMemory(),
		}

		podInfos := devInfo.GetPods()
		pods := []*Pod{}
		for _, podInfo := range podInfos {
			if utils.AssignedNonTerminatedPod(podInfo) {
				pod := &Pod{
					Namespace: podInfo.Namespace,
					Name:      podInfo.Name,
					UsedGPU:   utils.GetGPUMemoryFromPodResource(podInfo),
				}
				pods = append(pods, pod)
			}
		}
		dev.Pods = pods
		devs = append(devs, dev)
		usedGPU += devInfo.GetUsedGPUMemory()
	}

	return &Node{
		Name:     info.GetName(),
		TotalGPU: uint(info.GetTotalGPUMemory()),
		UsedGPU:  usedGPU,
		Devices:  devs,
	}
}
gpushare-scheduler-extender/pkg/scheduler/predicate.go (new file, 87 lines)
@@ -0,0 +1,87 @@
package scheduler

import (
	"fmt"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
	"k8s.io/api/core/v1"
	schedulerapi "k8s.io/kube-scheduler/extender/v1"
)

type Predicate struct {
	Name  string
	cache *cache.SchedulerCache
}

func (p Predicate) checkNode(pod *v1.Pod, nodeName string, c *cache.SchedulerCache) (*v1.Node, error) {
	log.V(10).Info("info: check if the pod %s can be scheduled on node %s", pod.Name, nodeName)
	nodeInfo, err := c.GetNodeInfo(nodeName)
	if err != nil {
		return nil, err
	}

	node := nodeInfo.GetNode()
	if node == nil {
		return nil, fmt.Errorf("failed to get node with name %s", nodeName)
	}
	if !utils.IsGPUSharingNode(node) {
		return nil, fmt.Errorf("The node %s is not for GPU sharing, need to skip", nodeName)
	}

	allocatable := nodeInfo.Assume(pod)
	if !allocatable {
		return nil, fmt.Errorf("Insufficient GPU Memory in one device")
	} else {
		log.V(10).Info("info: The pod %s in the namespace %s can be scheduled on %s",
			pod.Name,
			pod.Namespace,
			nodeName)
	}
	return node, nil
}

func (p Predicate) Handler(args *schedulerapi.ExtenderArgs) *schedulerapi.ExtenderFilterResult {
	if args == nil || args.Pod == nil {
		return &schedulerapi.ExtenderFilterResult{Error: fmt.Sprintf("arg or pod is nil")}
	}

	pod := args.Pod
	var nodeNames []string
	if args.NodeNames != nil {
		nodeNames = *args.NodeNames
		log.V(3).Info("extender args NodeNames is not nil, result %+v", nodeNames)
	} else if args.Nodes != nil {
		for _, n := range args.Nodes.Items {
			nodeNames = append(nodeNames, n.Name)
		}
		log.V(3).Info("extender args Nodes is not nil, names are %+v", nodeNames)
	} else {
		return &schedulerapi.ExtenderFilterResult{Error: fmt.Sprintf("cannot get node names")}
	}
	canSchedule := make([]string, 0, len(nodeNames))
	canNotSchedule := make(map[string]string)
	canScheduleNodes := &v1.NodeList{}

	for _, nodeName := range nodeNames {
		node, err := p.checkNode(pod, nodeName, p.cache)
		if err != nil {
			canNotSchedule[nodeName] = err.Error()
		} else {
			if node != nil {
				canSchedule = append(canSchedule, nodeName)
				canScheduleNodes.Items = append(canScheduleNodes.Items, *node)
			}
		}
	}

	result := schedulerapi.ExtenderFilterResult{
		NodeNames:   &canSchedule,
		Nodes:       canScheduleNodes,
		FailedNodes: canNotSchedule,
		Error:       "",
	}

	log.V(100).Info("predicate result for %s is %+v", pod.Name, result)
	return &result
}
gpushare-scheduler-extender/pkg/utils/const.go (new file, 13 lines)
@@ -0,0 +1,13 @@
package utils

const (
    ResourceName = "rainbond.com/gpu-mem"
    CountName    = "rainbond.com/gpu-count"

    EnvNVGPU              = "NVIDIA_VISIBLE_DEVICES"
    EnvResourceIndex      = "ALIYUN_COM_GPU_MEM_IDX"
    EnvResourceByPod      = "ALIYUN_COM_GPU_MEM_POD"
    EnvResourceByDev      = "ALIYUN_COM_GPU_MEM_DEV"
    EnvAssignedFlag       = "ALIYUN_COM_GPU_MEM_ASSIGNED"
    EnvResourceAssumeTime = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
)
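For reference, a workload consumes the extended resource named by ResourceName through an ordinary container limit. A minimal sketch, not part of this commit; the container name and image are made up, and the unit of the value (MiB or GiB) depends on how the matching device plugin is deployed.

package example // illustrative only

import (
    "github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"
)

// gpuShareContainer returns a container that asks for a slice of GPU memory
// instead of a whole device, via the rainbond.com/gpu-mem extended resource.
func gpuShareContainer() v1.Container {
    return v1.Container{
        Name:  "cuda-demo",             // hypothetical
        Image: "nvidia/cuda:11.0-base", // hypothetical
        Resources: v1.ResourceRequirements{
            Limits: v1.ResourceList{
                v1.ResourceName(utils.ResourceName): resource.MustParse("3"),
            },
        },
    }
}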
30
gpushare-scheduler-extender/pkg/utils/node.go
Normal file
@@ -0,0 +1,30 @@
package utils

import "k8s.io/api/core/v1"

// IsGPUSharingNode checks whether the node is configured for GPU sharing.
func IsGPUSharingNode(node *v1.Node) bool {
    return GetTotalGPUMemory(node) > 0
}

// GetTotalGPUMemory returns the total GPU memory of the node.
func GetTotalGPUMemory(node *v1.Node) int {
    val, ok := node.Status.Capacity[ResourceName]

    if !ok {
        return 0
    }

    return int(val.Value())
}

// GetGPUCountInNode returns the GPU count of the node.
func GetGPUCountInNode(node *v1.Node) int {
    val, ok := node.Status.Capacity[CountName]

    if !ok {
        return 0
    }

    return int(val.Value())
}
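A quick sketch, not part of this commit, of how these helpers behave against a node whose capacity advertises the extended resources above; the quantities are made up for illustration.

package main // illustrative only

import (
    "fmt"

    "github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"
)

func main() {
    node := &v1.Node{}
    node.Status.Capacity = v1.ResourceList{
        v1.ResourceName(utils.ResourceName): resource.MustParse("16"), // total shareable GPU memory
        v1.ResourceName(utils.CountName):    resource.MustParse("2"),  // physical GPU count
    }

    fmt.Println(utils.IsGPUSharingNode(node))  // true
    fmt.Println(utils.GetTotalGPUMemory(node)) // 16
    fmt.Println(utils.GetGPUCountInNode(node)) // 2
}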
219
gpushare-scheduler-extender/pkg/utils/pod.go
Normal file
@@ -0,0 +1,219 @@
package utils

import (
    "encoding/json"
    "fmt"
    "github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
    v1 "k8s.io/api/core/v1"
    "strconv"
    "time"
)

// AssignedNonTerminatedPod selects pods that are assigned and non-terminal (scheduled and running).
func AssignedNonTerminatedPod(pod *v1.Pod) bool {
    if pod.DeletionTimestamp != nil {
        return false
    }

    if len(pod.Spec.NodeName) == 0 {
        return false
    }
    if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
        return false
    }
    return true
}

// IsCompletePod determines if the pod is complete
func IsCompletePod(pod *v1.Pod) bool {
    if pod.DeletionTimestamp != nil {
        return true
    }

    if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
        return true
    }
    return false
}

// IsGPUsharingPod determines whether the pod is a GPU-sharing pod
func IsGPUsharingPod(pod *v1.Pod) bool {
    return GetGPUMemoryFromPodResource(pod) > 0
}

// GetGPUIDFromAnnotation gets GPU ID from Annotation
func GetGPUIDFromAnnotation(pod *v1.Pod) int {
    id := -1
    if len(pod.ObjectMeta.Annotations) > 0 {
        value, found := pod.ObjectMeta.Annotations[EnvResourceIndex]
        if found {
            var err error
            id, err = strconv.Atoi(value)
            if err != nil {
                log.V(9).Info("warn: Failed due to %v for pod %s in ns %s", err, pod.Name, pod.Namespace)
                id = -1
            }
        }
    }

    return id
}

// GetGPUIDFromEnv gets GPU ID from Env
func GetGPUIDFromEnv(pod *v1.Pod) int {
    id := -1
    for _, container := range pod.Spec.Containers {
        id = getGPUIDFromContainer(container)
        if id >= 0 {
            return id
        }
    }

    return id
}

func getGPUIDFromContainer(container v1.Container) (devIdx int) {
    devIdx = -1
    var err error
loop:
    for _, env := range container.Env {
        if env.Name == EnvResourceIndex {
            devIdx, err = strconv.Atoi(env.Value)
            if err != nil {
                log.V(9).Info("warn: Failed due to %v for %s", err, container.Name)
                devIdx = -1
            }
            break loop
        }
    }

    return devIdx
}

// GetGPUMemoryFromPodAnnotation gets the GPU memory of the pod from the ALIYUN_COM_GPU_MEM_POD annotation
func GetGPUMemoryFromPodAnnotation(pod *v1.Pod) (gpuMemory uint) {
    if len(pod.ObjectMeta.Annotations) > 0 {
        value, found := pod.ObjectMeta.Annotations[EnvResourceByPod]
        if found {
            s, _ := strconv.Atoi(value)
            if s < 0 {
                s = 0
            }

            gpuMemory += uint(s)
        }
    }

    log.V(100).Info("debug: pod %s in ns %s with status %v has GPU Mem %d",
        pod.Name,
        pod.Namespace,
        pod.Status.Phase,
        gpuMemory)
    return gpuMemory
}

// GetGPUMemoryFromPodEnv gets the GPU memory of the pod from its containers' ALIYUN_COM_GPU_MEM_POD env variables
func GetGPUMemoryFromPodEnv(pod *v1.Pod) (gpuMemory uint) {
    for _, container := range pod.Spec.Containers {
        gpuMemory += getGPUMemoryFromContainerEnv(container)
    }
    log.V(100).Info("debug: pod %s in ns %s with status %v has GPU Mem %d",
        pod.Name,
        pod.Namespace,
        pod.Status.Phase,
        gpuMemory)
    return gpuMemory
}

func getGPUMemoryFromContainerEnv(container v1.Container) (gpuMemory uint) {
    gpuMemory = 0
loop:
    for _, env := range container.Env {
        if env.Name == EnvResourceByPod {
            s, _ := strconv.Atoi(env.Value)
            if s < 0 {
                s = 0
            }
            gpuMemory = uint(s)
            break loop
        }
    }

    return gpuMemory
}

// GetGPUMemoryFromPodResource gets GPU Memory of the Pod
func GetGPUMemoryFromPodResource(pod *v1.Pod) int {
    var total int
    containers := pod.Spec.Containers
    for _, container := range containers {
        if val, ok := container.Resources.Limits[ResourceName]; ok {
            total += int(val.Value())
        }
    }
    return total
}

// GetGPUMemoryFromContainerResource gets GPU Memory of the Container
func GetGPUMemoryFromContainerResource(container v1.Container) int {
    var total int
    if val, ok := container.Resources.Limits[ResourceName]; ok {
        total += int(val.Value())
    }
    return total
}

// GetUpdatedPodEnvSpec updates pod env with devId
func GetUpdatedPodEnvSpec(oldPod *v1.Pod, devId int, totalGPUMemByDev int) (newPod *v1.Pod) {
    newPod = oldPod.DeepCopy()
    for i, c := range newPod.Spec.Containers {
        gpuMem := GetGPUMemoryFromContainerResource(c)

        if gpuMem > 0 {
            envs := []v1.EnvVar{
                // v1.EnvVar{Name: EnvNVGPU, Value: fmt.Sprintf("%d", devId)},
                v1.EnvVar{Name: EnvResourceIndex, Value: fmt.Sprintf("%d", devId)},
                v1.EnvVar{Name: EnvResourceByPod, Value: fmt.Sprintf("%d", gpuMem)},
                v1.EnvVar{Name: EnvResourceByDev, Value: fmt.Sprintf("%d", totalGPUMemByDev)},
                v1.EnvVar{Name: EnvAssignedFlag, Value: "false"},
            }

            for _, env := range envs {
                newPod.Spec.Containers[i].Env = append(newPod.Spec.Containers[i].Env,
                    env)
            }
        }
    }

    return newPod
}

// GetUpdatedPodAnnotationSpec updates pod annotations with devId
func GetUpdatedPodAnnotationSpec(oldPod *v1.Pod, devId int, totalGPUMemByDev int) (newPod *v1.Pod) {
    newPod = oldPod.DeepCopy()
    if len(newPod.ObjectMeta.Annotations) == 0 {
        newPod.ObjectMeta.Annotations = map[string]string{}
    }

    now := time.Now()
    newPod.ObjectMeta.Annotations[EnvResourceIndex] = fmt.Sprintf("%d", devId)
    newPod.ObjectMeta.Annotations[EnvResourceByDev] = fmt.Sprintf("%d", totalGPUMemByDev)
    newPod.ObjectMeta.Annotations[EnvResourceByPod] = fmt.Sprintf("%d", GetGPUMemoryFromPodResource(newPod))
    newPod.ObjectMeta.Annotations[EnvAssignedFlag] = "false"
    newPod.ObjectMeta.Annotations[EnvResourceAssumeTime] = fmt.Sprintf("%d", now.UnixNano())

    return newPod
}

func PatchPodAnnotationSpec(oldPod *v1.Pod, devId int, totalGPUMemByDev int) ([]byte, error) {
    now := time.Now()
    patchAnnotations := map[string]interface{}{
        "metadata": map[string]map[string]string{"annotations": {
            EnvResourceIndex:      fmt.Sprintf("%d", devId),
            EnvResourceByDev:      fmt.Sprintf("%d", totalGPUMemByDev),
            EnvResourceByPod:      fmt.Sprintf("%d", GetGPUMemoryFromPodResource(oldPod)),
            EnvAssignedFlag:       "false",
            EnvResourceAssumeTime: fmt.Sprintf("%d", now.UnixNano()),
        }}}
    return json.Marshal(patchAnnotations)
}
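To see what PatchPodAnnotationSpec produces, here is a small sketch, not part of this commit, that builds a pod with a gpu-mem limit and prints the resulting patch; the container name and the device index and memory values are made up. The returned bytes are what a binder would hand to the Kubernetes Patch API.

package main // illustrative only

import (
    "fmt"

    "github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"
)

func main() {
    pod := &v1.Pod{}
    pod.Spec.Containers = []v1.Container{{
        Name: "cuda-demo", // hypothetical
        Resources: v1.ResourceRequirements{
            Limits: v1.ResourceList{
                v1.ResourceName(utils.ResourceName): resource.MustParse("3"),
            },
        },
    }}

    // Record device index 1 and a per-device total of 16 in the annotations.
    patch, err := utils.PatchPodAnnotationSpec(pod, 1, 16)
    if err != nil {
        panic(err)
    }
    fmt.Println(string(patch))
    // prints something like {"metadata":{"annotations":{"ALIYUN_COM_GPU_MEM_ASSIGNED":"false", ...}}}
}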
30
gpushare-scheduler-extender/pkg/utils/signals/signal.go
Normal file
@@ -0,0 +1,30 @@
package signals

import (
    "os"
    "os/signal"
    "syscall"
)

var onlyOneSignalHandler = make(chan struct{})

var shutdownSignals = []os.Signal{os.Interrupt, syscall.SIGTERM}

// SetupSignalHandler registers for SIGTERM and SIGINT. A stop channel is returned
// which is closed on one of these signals. If a second signal is caught, the program
// is terminated with exit code 1.
func SetupSignalHandler() (stopCh <-chan struct{}) {
    close(onlyOneSignalHandler) // panics when called twice

    stop := make(chan struct{})
    c := make(chan os.Signal, 2)
    signal.Notify(c, shutdownSignals...)
    go func() {
        <-c
        close(stop)
        <-c
        os.Exit(1) // second signal. Exit directly.
    }()

    return stop
}
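Typical use from the extender's entry point, sketched here (not part of this commit) with a placeholder for whatever servers or informers the binary actually starts:

package main // illustrative only

import (
    "log"

    "github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils/signals"
)

func main() {
    stopCh := signals.SetupSignalHandler()

    // ... start informers and the extender HTTP server here ...

    <-stopCh // block until SIGINT or SIGTERM
    log.Println("shutdown signal received, exiting")
}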