package nvidia

import (
	"encoding/json"
	"fmt"
	"os"
	"sort"
	"time"

	"github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client"
	log "github.com/golang/glog"
	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/clientcmd"
	nodeutil "k8s.io/kubernetes/pkg/util/node"
)

var (
	clientset *kubernetes.Clientset
	nodeName  string
	retries   = 8
)
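
// kubeInit builds the shared Kubernetes clientset, preferring the kubeconfig file
// pointed to by KUBECONFIG when it exists and falling back to the in-cluster
// config otherwise, and reads the node name from the NODE_NAME environment variable.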
func kubeInit() {
	kubeconfigFile := os.Getenv("KUBECONFIG")
	var err error
	var config *rest.Config

	if _, err = os.Stat(kubeconfigFile); err != nil {
		log.V(5).Infof("failed to find kubeconfig %s due to %v, falling back to the in-cluster config", kubeconfigFile, err)
		config, err = rest.InClusterConfig()
		if err != nil {
			log.Fatalf("Failed to build in-cluster config due to %v", err)
		}
	} else {
		config, err = clientcmd.BuildConfigFromFlags("", kubeconfigFile)
		if err != nil {
			log.Fatalf("Failed to build config from kubeconfig %s due to %v", kubeconfigFile, err)
		}
	}

	clientset, err = kubernetes.NewForConfig(config)
	if err != nil {
		log.Fatalf("Failed to create clientset due to %v", err)
	}

	nodeName = os.Getenv("NODE_NAME")
	if nodeName == "" {
		log.Fatalln("Please set env NODE_NAME")
	}
}
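
// disableCGPUIsolationOrNot reports whether cGPU isolation should be disabled on
// this node, i.e. whether the node label keyed by EnvNodeLabelForDisableCGPU is
// set to "true".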
func disableCGPUIsolationOrNot() (bool, error) {
	disable := false
	node, err := clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
	if err != nil {
		return disable, err
	}
	labels := node.ObjectMeta.Labels
	value, ok := labels[EnvNodeLabelForDisableCGPU]
	if ok && value == "true" {
		log.Infof("gpushare mode is enabled and cgpu mode is disabled on this node")
		disable = true
	}
	return disable, nil
}
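
// patchGPUCount patches the node status so that the Capacity and Allocatable
// values of resourceCount reflect the number of physical GPUs on the node; it is
// a no-op when the capacity already matches.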
func patchGPUCount(gpuCount int) error {
	node, err := clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
	if err != nil {
		return err
	}

	if val, ok := node.Status.Capacity[resourceCount]; ok {
		if val.Value() == int64(gpuCount) {
			log.Infof("No need to update Capacity %s", resourceCount)
			return nil
		}
	}

	newNode := node.DeepCopy()
	newNode.Status.Capacity[resourceCount] = *resource.NewQuantity(int64(gpuCount), resource.DecimalSI)
	newNode.Status.Allocatable[resourceCount] = *resource.NewQuantity(int64(gpuCount), resource.DecimalSI)
	// content := fmt.Sprintf(`[{"op": "add", "path": "/status/capacity/aliyun.com~gpu-count", "value": "%d"}]`, gpuCount)
	// _, err = clientset.CoreV1().Nodes().PatchStatus(nodeName, []byte(content))
	_, _, err = nodeutil.PatchNodeStatus(clientset.CoreV1(), types.NodeName(nodeName), node, newNode)
	if err != nil {
		log.Infof("Failed to update Capacity %s due to %v.", resourceCount, err)
	} else {
		log.Infof("Updated Capacity %s successfully.", resourceCount)
	}
	return err
}
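
// getPodList fetches the pods on this node from the kubelet and keeps only those
// in the Pending phase; it returns an error when no pending pod is found.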
func getPodList(kubeletClient *client.KubeletClient) (*v1.PodList, error) {
	podList, err := kubeletClient.GetNodeRunningPods()
	if err != nil {
		return nil, err
	}

	list, _ := json.Marshal(podList)
	log.V(8).Infof("get pods list %v", string(list))

	resultPodList := &v1.PodList{}
	for _, metaPod := range podList.Items {
		if metaPod.Status.Phase != v1.PodPending {
			continue
		}
		resultPodList.Items = append(resultPodList.Items, metaPod)
	}

	if len(resultPodList.Items) == 0 {
		return nil, fmt.Errorf("no pending pod found")
	}

	return resultPodList, nil
}
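
// getPodListsByQueryKubelet gets the pending pods on this node from the kubelet,
// retrying a few times before falling back to listing them from the API server.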
func getPodListsByQueryKubelet(kubeletClient *client.KubeletClient) (*v1.PodList, error) {
	podList, err := getPodList(kubeletClient)
	for i := 0; i < retries && err != nil; i++ {
		podList, err = getPodList(kubeletClient)
		log.Warningf("failed to get pending pod list, retrying")
		time.Sleep(100 * time.Millisecond)
	}
	if err != nil {
		log.Warningf("no pending pods found from the kubelet /pods API, falling back to listing from the API server")
		podList, err = getPodListsByListAPIServer()
		if err != nil {
			return nil, err
		}
	}
	return podList, nil
}
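
// getPodListsByListAPIServer lists the pending pods assigned to this node directly
// from the API server, using a field selector on spec.nodeName and status.phase,
// with a small number of retries.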
func getPodListsByListAPIServer() (*v1.PodList, error) {
	selector := fields.SelectorFromSet(fields.Set{"spec.nodeName": nodeName, "status.phase": "Pending"})
	podList, err := clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
		FieldSelector: selector.String(),
		LabelSelector: labels.Everything().String(),
	})
	for i := 0; i < 3 && err != nil; i++ {
		podList, err = clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
			FieldSelector: selector.String(),
			LabelSelector: labels.Everything().String(),
		})
		time.Sleep(1 * time.Second)
	}
	if err != nil {
		return nil, fmt.Errorf("failed to get Pods assigned to node %v: %v", nodeName, err)
	}

	return podList, nil
}
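
// getPendingPodsInNode returns the pending pods assigned to this node, deduplicated
// by pod UID, obtained either from the kubelet or from the API server depending on
// queryKubelet.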
func getPendingPodsInNode(queryKubelet bool, kubeletClient *client.KubeletClient) ([]v1.Pod, error) {
	// pods, err := m.lister.List(labels.Everything())
	// if err != nil {
	// 	return nil, err
	// }
	pods := []v1.Pod{}

	podIDMap := map[types.UID]bool{}

	var podList *v1.PodList
	var err error
	if queryKubelet {
		podList, err = getPodListsByQueryKubelet(kubeletClient)
		if err != nil {
			return nil, err
		}
	} else {
		podList, err = getPodListsByListAPIServer()
		if err != nil {
			return nil, err
		}
	}

	log.V(5).Infof("all pod list %v", podList.Items)

	// if log.V(5) {
	for _, pod := range podList.Items {
		if pod.Spec.NodeName != nodeName {
			log.Warningf("Pod %s in ns %s is not assigned to node %s as expected, it is placed on node %s",
				pod.Name,
				pod.Namespace,
				nodeName,
				pod.Spec.NodeName)
		} else {
			log.Infof("list pod %s in ns %s on node %s and status is %s",
				pod.Name,
				pod.Namespace,
				nodeName,
				pod.Status.Phase,
			)
			// Deduplicate pods by UID before collecting them.
			if _, ok := podIDMap[pod.UID]; !ok {
				pods = append(pods, pod)
				podIDMap[pod.UID] = true
			}
		}
	}
	// }

	return pods, nil
}

// getCandidatePods picks up the gpushare pods whose assigned status is false, and
// returns them ordered by GPU assume time.
func getCandidatePods(queryKubelet bool, client *client.KubeletClient) ([]*v1.Pod, error) {
	candidatePods := []*v1.Pod{}
	allPods, err := getPendingPodsInNode(queryKubelet, client)
	if err != nil {
		return candidatePods, err
	}
	for _, pod := range allPods {
		current := pod
		if isGPUMemoryAssumedPod(&current) {
			candidatePods = append(candidatePods, &current)
		}
	}

	if log.V(4) {
		for _, pod := range candidatePods {
			log.Infof("candidate pod %s in ns %s with timestamp %d is found.",
				pod.Name,
				pod.Namespace,
				getAssumeTimeFromPodAnnotation(pod))
		}
	}

	return makePodOrderdByAge(candidatePods), nil
}

// makePodOrderdByAge returns the pods ordered by their GPU assume time (oldest first).
func makePodOrderdByAge(pods []*v1.Pod) []*v1.Pod {
	newPodList := make(orderedPodByAssumeTime, 0, len(pods))
	newPodList = append(newPodList, pods...)
	sort.Sort(newPodList)
	return []*v1.Pod(newPodList)
}
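
// orderedPodByAssumeTime implements sort.Interface, ordering pods by the GPU assume
// time recorded in their annotations.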
type orderedPodByAssumeTime []*v1.Pod

func (this orderedPodByAssumeTime) Len() int {
	return len(this)
}

func (this orderedPodByAssumeTime) Less(i, j int) bool {
	// Use strict less-than so Less defines a proper strict ordering for sort.Sort.
	return getAssumeTimeFromPodAnnotation(this[i]) < getAssumeTimeFromPodAnnotation(this[j])
}

func (this orderedPodByAssumeTime) Swap(i, j int) {
	this[i], this[j] = this[j], this[i]
}