synchronization
gpushare-scheduler-extender/pkg/scheduler/bind.go (26 lines, Normal file)
@@ -0,0 +1,26 @@
package scheduler

import (
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"

	"k8s.io/apimachinery/pkg/types"
	schedulerapi "k8s.io/kube-scheduler/extender/v1"
)

// Bind is responsible for binding node and pod
type Bind struct {
	Name  string
	Func  func(podName string, podNamespace string, podUID types.UID, node string, cache *cache.SchedulerCache) error
	cache *cache.SchedulerCache
}

// Handler handles the Bind request
func (b Bind) Handler(args schedulerapi.ExtenderBindingArgs) *schedulerapi.ExtenderBindingResult {
	err := b.Func(args.PodName, args.PodNamespace, args.PodUID, args.Node, b.cache)
	errMsg := ""
	if err != nil {
		errMsg = err.Error()
	}
	return &schedulerapi.ExtenderBindingResult{
		Error: errMsg,
	}
}
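Note: the HTTP route that feeds this handler is not part of this commit. As a rough sketch of how the bind verb is typically wired up (the helper below and its direct use of net/http are assumptions, not the project's actual routes package), the server decodes schedulerapi.ExtenderBindingArgs from the request body, calls Handler, and writes the ExtenderBindingResult back as JSON:

// Sketch only: a hypothetical HTTP adapter for Bind.Handler; not part of this diff.
package main

import (
	"encoding/json"
	"net/http"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/scheduler"
	schedulerapi "k8s.io/kube-scheduler/extender/v1"
)

func bindRoute(bind *scheduler.Bind) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		var args schedulerapi.ExtenderBindingArgs
		if err := json.NewDecoder(r.Body).Decode(&args); err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}
		// Handler never fails at the transport level; binding errors come back inside the result body.
		w.Header().Set("Content-Type", "application/json")
		_ = json.NewEncoder(w).Encode(bind.Handler(args))
	}
}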
gpushare-scheduler-extender/pkg/scheduler/gpushare-bind.go (71 lines, Normal file)
@@ -0,0 +1,71 @@
package scheduler

import (
	"context"
	"fmt"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
)

const (
	OptimisticLockErrorMsg = "the object has been modified; please apply your changes to the latest version and try again"
)

func NewGPUShareBind(ctx context.Context, clientset *kubernetes.Clientset, c *cache.SchedulerCache) *Bind {
	return &Bind{
		Name: "gpusharingbinding",
		Func: func(name string, namespace string, podUID types.UID, node string, c *cache.SchedulerCache) error {
			pod, err := getPod(ctx, name, namespace, podUID, clientset, c)
			if err != nil {
				log.V(9).Info("warn: Failed to handle pod %s in ns %s due to error %v", name, namespace, err)
				return err
			}

			nodeInfo, err := c.GetNodeInfo(node)
			if err != nil {
				log.V(9).Info("warn: Failed to handle pod %s in ns %s due to error %v", name, namespace, err)
				return err
			}
			err = nodeInfo.Allocate(clientset, pod)
			if err != nil {
				log.V(9).Info("warn: Failed to handle pod %s in ns %s due to error %v", name, namespace, err)
				return err
			}
			return nil
		},
		cache: c,
	}
}

func getPod(ctx context.Context, name string, namespace string, podUID types.UID, clientset *kubernetes.Clientset, c *cache.SchedulerCache) (pod *v1.Pod, err error) {
	pod, err = c.GetPod(name, namespace)
	if errors.IsNotFound(err) {
		pod, err = clientset.CoreV1().Pods(namespace).Get(ctx, name, metav1.GetOptions{})
		if err != nil {
			return nil, err
		}
	} else if err != nil {
		return nil, err
	}
	if pod.UID != podUID {
		pod, err = clientset.CoreV1().Pods(namespace).Get(ctx, name, metav1.GetOptions{})
		if err != nil {
			return nil, err
		}
		if pod.UID != podUID {
			return nil, fmt.Errorf("pod %s in ns %s has UID %v, which does not match the expected UID %v",
				name,
				namespace,
				pod.UID,
				podUID)
		}
	}

	return pod, nil
}
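Note: OptimisticLockErrorMsg is declared here but not consumed in this file; presumably the allocation path (nodeInfo.Allocate, implemented in pkg/cache and not shown in this diff) retries pod updates that fail with that conflict message. A minimal sketch of that pattern, where updatePod is a hypothetical closure standing in for whatever client-go update the allocator performs:

// Sketch only: one plausible consumer of OptimisticLockErrorMsg; not part of this diff.
package scheduler

import (
	"strings"
	"time"
)

func updateWithRetry(updatePod func() error) error {
	var err error
	for attempt := 0; attempt < 3; attempt++ {
		err = updatePod()
		// Stop on success or on any error other than the optimistic-lock conflict.
		if err == nil || !strings.Contains(err.Error(), OptimisticLockErrorMsg) {
			return err
		}
		time.Sleep(100 * time.Millisecond) // brief backoff before retrying with a fresher object
	}
	return err
}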
@@ -0,0 +1,42 @@
package scheduler

import (
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
)

func NewGPUShareInspect(c *cache.SchedulerCache) *Inspect {
	return &Inspect{
		Name:  "gpushareinspect",
		cache: c,
	}
}

type Result struct {
	Nodes []*Node `json:"nodes"`
	Error string  `json:"error,omitempty"`
}

type Node struct {
	Name     string    `json:"name"`
	TotalGPU uint      `json:"totalGPU"`
	UsedGPU  uint      `json:"usedGPU"`
	Devices  []*Device `json:"devs"`
}

type Device struct {
	ID       int    `json:"id"`
	TotalGPU uint   `json:"totalGPU"`
	UsedGPU  uint   `json:"usedGPU"`
	Pods     []*Pod `json:"pods"`
}

type Pod struct {
	Name      string `json:"name"`
	Namespace string `json:"namespace"`
	UsedGPU   int    `json:"usedGPU"`
}

type Inspect struct {
	Name  string
	cache *cache.SchedulerCache
}
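Note: the struct tags above fix the JSON shape of the inspect endpoint. A small, self-contained illustration (node name and numbers are invented; the empty Error field disappears thanks to omitempty):

// Sketch only: serializing a hand-built Result to show the wire format.
package main

import (
	"encoding/json"
	"fmt"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/scheduler"
)

func main() {
	r := &scheduler.Result{
		Nodes: []*scheduler.Node{{
			Name:     "node-1",
			TotalGPU: 16,
			UsedGPU:  4,
			Devices: []*scheduler.Device{{
				ID:       0,
				TotalGPU: 16,
				UsedGPU:  4,
				Pods:     []*scheduler.Pod{{Name: "demo", Namespace: "default", UsedGPU: 4}},
			}},
		}},
	}
	out, _ := json.Marshal(r)
	fmt.Println(string(out))
	// Output (wrapped for readability):
	// {"nodes":[{"name":"node-1","totalGPU":16,"usedGPU":4,
	//   "devs":[{"id":0,"totalGPU":16,"usedGPU":4,
	//     "pods":[{"name":"demo","namespace":"default","usedGPU":4}]}]}]}
}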
@@ -0,0 +1,10 @@
package scheduler

import (
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
	"k8s.io/client-go/kubernetes"
)

func NewGPUsharePredicate(clientset *kubernetes.Clientset, c *cache.SchedulerCache) *Predicate {
	return &Predicate{Name: "gpusharingfilter", cache: c}
}
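Note: how the three components are constructed together is not shown in this commit. A rough sketch of the wiring a main package might do (the in-cluster config and the omitted cache/informer setup are assumptions, not the project's actual entry point):

// Sketch only: building the predicate, bind and inspect components from one clientset and cache.
package main

import (
	"context"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/scheduler"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

func buildExtender(ctx context.Context, c *cache.SchedulerCache) (*scheduler.Predicate, *scheduler.Bind, *scheduler.Inspect, error) {
	cfg, err := rest.InClusterConfig()
	if err != nil {
		return nil, nil, nil, err
	}
	clientset, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		return nil, nil, nil, err
	}
	return scheduler.NewGPUsharePredicate(clientset, c),
		scheduler.NewGPUShareBind(ctx, clientset, c),
		scheduler.NewGPUShareInspect(c),
		nil
}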
gpushare-scheduler-extender/pkg/scheduler/inspect.go (69 lines, Normal file)
@@ -0,0 +1,69 @@
package scheduler

import (
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
)

func (in Inspect) Handler(name string) *Result {
	nodes := []*Node{}
	errMsg := ""
	if len(name) == 0 {
		nodeInfos := in.cache.GetNodeinfos()
		for _, info := range nodeInfos {
			nodes = append(nodes, buildNode(info))
		}
	} else {
		node, err := in.cache.GetNodeInfo(name)
		if err != nil {
			errMsg = err.Error()
		} else {
			// Only build the node entry when the lookup succeeded, so a nil NodeInfo is never dereferenced.
			nodes = append(nodes, buildNode(node))
		}
	}

	return &Result{
		Nodes: nodes,
		Error: errMsg,
	}
}

func buildNode(info *cache.NodeInfo) *Node {
	devInfos := info.GetDevs()
	devs := []*Device{}
	var usedGPU uint

	for i, devInfo := range devInfos {
		dev := &Device{
			ID:       i,
			TotalGPU: devInfo.GetTotalGPUMemory(),
			UsedGPU:  devInfo.GetUsedGPUMemory(),
		}

		podInfos := devInfo.GetPods()
		pods := []*Pod{}
		for _, podInfo := range podInfos {
			if utils.AssignedNonTerminatedPod(podInfo) {
				pod := &Pod{
					Namespace: podInfo.Namespace,
					Name:      podInfo.Name,
					UsedGPU:   utils.GetGPUMemoryFromPodResource(podInfo),
				}
				pods = append(pods, pod)
			}
		}
		dev.Pods = pods
		devs = append(devs, dev)
		usedGPU += devInfo.GetUsedGPUMemory()
	}

	return &Node{
		Name:     info.GetName(),
		TotalGPU: uint(info.GetTotalGPUMemory()),
		UsedGPU:  usedGPU,
		Devices:  devs,
	}
}
gpushare-scheduler-extender/pkg/scheduler/predicate.go (87 lines, Normal file)
@@ -0,0 +1,87 @@
package scheduler

import (
	"fmt"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/cache"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/log"
	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/utils"
	v1 "k8s.io/api/core/v1"
	schedulerapi "k8s.io/kube-scheduler/extender/v1"
)

type Predicate struct {
	Name  string
	cache *cache.SchedulerCache
}

func (p Predicate) checkNode(pod *v1.Pod, nodeName string, c *cache.SchedulerCache) (*v1.Node, error) {
	log.V(10).Info("info: check if the pod %s can be scheduled on node %s", pod.Name, nodeName)
	nodeInfo, err := c.GetNodeInfo(nodeName)
	if err != nil {
		return nil, err
	}

	node := nodeInfo.GetNode()
	if node == nil {
		return nil, fmt.Errorf("failed to get node with name %s", nodeName)
	}
	if !utils.IsGPUSharingNode(node) {
		return nil, fmt.Errorf("node %s does not enable GPU sharing, skip it", nodeName)
	}

	allocatable := nodeInfo.Assume(pod)
	if !allocatable {
		return nil, fmt.Errorf("insufficient GPU memory in one device")
	}
	log.V(10).Info("info: the pod %s in the namespace %s can be scheduled on %s",
		pod.Name,
		pod.Namespace,
		nodeName)
	return node, nil
}

func (p Predicate) Handler(args *schedulerapi.ExtenderArgs) *schedulerapi.ExtenderFilterResult {
	if args == nil || args.Pod == nil {
		return &schedulerapi.ExtenderFilterResult{Error: "args or pod is nil"}
	}

	pod := args.Pod
	var nodeNames []string
	if args.NodeNames != nil {
		nodeNames = *args.NodeNames
		log.V(3).Info("extender args NodeNames is not nil, result %+v", nodeNames)
	} else if args.Nodes != nil {
		for _, n := range args.Nodes.Items {
			nodeNames = append(nodeNames, n.Name)
		}
		log.V(3).Info("extender args Nodes is not nil, names are %+v", nodeNames)
	} else {
		return &schedulerapi.ExtenderFilterResult{Error: "cannot get node names"}
	}
	canSchedule := make([]string, 0, len(nodeNames))
	canNotSchedule := make(map[string]string)
	canScheduleNodes := &v1.NodeList{}

	for _, nodeName := range nodeNames {
		node, err := p.checkNode(pod, nodeName, p.cache)
		if err != nil {
			canNotSchedule[nodeName] = err.Error()
		} else if node != nil {
			canSchedule = append(canSchedule, nodeName)
			canScheduleNodes.Items = append(canScheduleNodes.Items, *node)
		}
	}

	result := schedulerapi.ExtenderFilterResult{
		NodeNames:   &canSchedule,
		Nodes:       canScheduleNodes,
		FailedNodes: canNotSchedule,
		Error:       "",
	}

	log.V(100).Info("predicate result for %s is %+v", pod.Name, result)
	return &result
}
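Note: a quick way to exercise the filter contract end to end, for example from a test. The Predicate is taken as a parameter because the SchedulerCache setup is not shown in this diff; the pod and node names are invented:

// Sketch only: calling the filter verb directly and splitting the verdicts.
package main

import (
	"fmt"

	"github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/scheduler"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	schedulerapi "k8s.io/kube-scheduler/extender/v1"
)

func filterExample(p *scheduler.Predicate) {
	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "demo", Namespace: "default"}}
	nodeNames := []string{"gpu-node-1", "cpu-node-1"}

	result := p.Handler(&schedulerapi.ExtenderArgs{
		Pod:       pod,
		NodeNames: &nodeNames,
	})

	// Nodes that fit land in NodeNames; the rest land in FailedNodes with a reason string.
	fmt.Printf("schedulable: %v\n", *result.NodeNames)
	fmt.Printf("rejected:    %v\n", result.FailedNodes)
}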