synchronization

This commit is contained in:
2025-08-25 16:04:00 +08:00
commit 33f9b3ce46
1951 changed files with 854396 additions and 0 deletions


@@ -0,0 +1,198 @@
package nvidia
import (
"fmt"
"time"
log "github.com/golang/glog"
"golang.org/x/net/context"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
)
var (
clientTimeout = 30 * time.Second
lastAllocateTime time.Time
)
// init initializes the Kubernetes client used to query and patch pods
func init() {
kubeInit()
}
func buildErrResponse(reqs *pluginapi.AllocateRequest, podReqGPU uint) *pluginapi.AllocateResponse {
responses := pluginapi.AllocateResponse{}
for _, req := range reqs.ContainerRequests {
response := pluginapi.ContainerAllocateResponse{
Envs: map[string]string{
envNVGPU: fmt.Sprintf("no-gpu-has-%d%s-to-run", podReqGPU, metric),
EnvResourceIndex: "-1",
EnvResourceByPod: fmt.Sprintf("%d", podReqGPU),
EnvResourceByContainer: fmt.Sprintf("%d", uint(len(req.DevicesIDs))),
EnvResourceByDev: fmt.Sprintf("%d", getGPUMemory()),
},
}
responses.ContainerResponses = append(responses.ContainerResponses, &response)
}
return &responses
}
// Allocate returns the device assignments (as container env) for an allocation request.
func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
responses := pluginapi.AllocateResponse{}
log.Infoln("----Allocating GPU for gpu mem is started----")
var (
podReqGPU uint
found bool
assumePod *v1.Pod
)
// podReqGPU = uint(0)
for _, req := range reqs.ContainerRequests {
podReqGPU += uint(len(req.DevicesIDs))
}
log.Infof("RequestPodGPUs: %d", podReqGPU)
m.Lock()
defer m.Unlock()
log.Infoln("checking...")
pods, err := getCandidatePods(m.queryKubelet, m.kubeletClient)
if err != nil {
log.Infof("invalid allocation requst: Failed to find candidate pods due to %v", err)
return buildErrResponse(reqs, podReqGPU), nil
}
if log.V(4) {
for _, pod := range pods {
log.Infof("Pod %s in ns %s request GPU Memory %d with timestamp %v",
pod.Name,
pod.Namespace,
getGPUMemoryFromPodResource(pod),
getAssumeTimeFromPodAnnotation(pod))
}
}
for _, pod := range pods {
if getGPUMemoryFromPodResource(pod) == podReqGPU {
log.Infof("Found Assumed GPU shared Pod %s in ns %s with GPU Memory %d",
pod.Name,
pod.Namespace,
podReqGPU)
assumePod = pod
found = true
break
}
}
if found {
id := getGPUIDFromPodAnnotation(assumePod)
if id < 0 {
log.Warningf("Failed to get the dev ", assumePod)
}
candidateDevID := ""
if id >= 0 {
ok := false
candidateDevID, ok = m.GetDeviceNameByIndex(uint(id))
if !ok {
log.Warningf("Failed to find the dev for pod %v because it's not able to find dev with index %d",
assumePod,
id)
id = -1
}
}
if id < 0 {
return buildErrResponse(reqs, podReqGPU), nil
}
log.Infof("gpu index %v,uuid: %v", id, candidateDevID)
// 1. Create container requests
for _, req := range reqs.ContainerRequests {
reqGPU := uint(len(req.DevicesIDs))
response := pluginapi.ContainerAllocateResponse{
Envs: map[string]string{
envNVGPU: fmt.Sprintf("%v", id),
EnvResourceIndex: fmt.Sprintf("%d", id),
EnvResourceByPod: fmt.Sprintf("%d", podReqGPU),
EnvResourceByContainer: fmt.Sprintf("%d", reqGPU),
EnvResourceByDev: fmt.Sprintf("%d", getGPUMemory()),
},
}
if m.disableCGPUIsolation {
response.Envs["CGPU_DISABLE"] = "true"
}
responses.ContainerResponses = append(responses.ContainerResponses, &response)
}
// 2. Update Pod spec
patchedAnnotationBytes, err := patchPodAnnotationSpecAssigned()
if err != nil {
return buildErrResponse(reqs, podReqGPU), nil
}
_, err = clientset.CoreV1().Pods(assumePod.Namespace).Patch(assumePod.Name, types.StrategicMergePatchType, patchedAnnotationBytes)
if err != nil {
// the object has been modified; please apply your changes to the latest version and try again
if err.Error() == OptimisticLockErrorMsg {
// retry
_, err = clientset.CoreV1().Pods(assumePod.Namespace).Patch(assumePod.Name, types.StrategicMergePatchType, patchedAnnotationBytes)
if err != nil {
log.Warningf("Failed due to %v", err)
return buildErrResponse(reqs, podReqGPU), nil
}
} else {
log.Warningf("Failed due to %v", err)
return buildErrResponse(reqs, podReqGPU), nil
}
}
} else if len(m.devNameMap) == 1 {
var devName string
var devIndex uint
for d, index := range m.devNameMap {
devName = d
devIndex = index
break
}
log.Infof("this node has only one gpu device,skip to search pod and directly specify the device %v(%v) for container", devIndex, devName)
for _, req := range reqs.ContainerRequests {
reqGPU := uint(len(req.DevicesIDs))
response := pluginapi.ContainerAllocateResponse{
Envs: map[string]string{
envNVGPU: devName,
EnvResourceIndex: fmt.Sprintf("%d", devIndex),
EnvResourceByPod: fmt.Sprintf("%d", podReqGPU),
EnvResourceByContainer: fmt.Sprintf("%d", reqGPU),
EnvResourceByDev: fmt.Sprintf("%d", getGPUMemory()),
},
}
if m.disableCGPUIsolation {
response.Envs["CGPU_DISABLE"] = "true"
}
responses.ContainerResponses = append(responses.ContainerResponses, &response)
}
log.Infof("get allocated GPUs info %v", responses)
return &responses, nil
} else {
log.Warningf("invalid allocation requst: request GPU memory %d can't be satisfied.",
podReqGPU)
// return &responses, fmt.Errorf("invalid allocation requst: request GPU memory %d can't be satisfied", reqGPU)
return buildErrResponse(reqs, podReqGPU), nil
}
podName := ""
if assumePod != nil {
podName = assumePod.Name
}
log.Infof("pod %v, new allocated GPUs info %v", podName, &responses)
log.Infof("----Allocating GPU for gpu mem for %v is ended----", podName)
// // Add this to make sure the container is created at least
// currentTime := time.Now()
// currentTime.Sub(lastAllocateTime)
return &responses, nil
}
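
For reference, the only contract a container gets from this Allocate path is the environment set above. A minimal, hypothetical in-container sketch that reads those variables back (the variable names come from constants.go below; this program is not part of the commit):

package main

import (
	"fmt"
	"os"
)

func main() {
	// Set by Allocate: the physical GPU index exposed to this container.
	gpu := os.Getenv("NVIDIA_VISIBLE_DEVICES")
	// GPU memory units granted to this container and total units on the device.
	granted := os.Getenv("ALIYUN_COM_GPU_MEM_CONTAINER")
	total := os.Getenv("ALIYUN_COM_GPU_MEM_DEV")
	fmt.Printf("running on GPU %s, granted %s of %s memory units\n", gpu, granted, total)
}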


@@ -0,0 +1,36 @@
package nvidia
import (
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
)
// MemoryUnit describes GPU Memory; currently only GiB and MiB are supported
type MemoryUnit string
const (
resourceName = "rainbond.com/gpu-mem"
resourceCount = "rainbond.com/gpu-count"
serverSock = pluginapi.DevicePluginPath + "aliyungpushare.sock"
OptimisticLockErrorMsg = "the object has been modified; please apply your changes to the latest version and try again"
allHealthChecks = "xids"
containerTypeLabelKey = "io.kubernetes.docker.type"
containerTypeLabelSandbox = "podsandbox"
containerTypeLabelContainer = "container"
containerLogPathLabelKey = "io.kubernetes.container.logpath"
sandboxIDLabelKey = "io.kubernetes.sandbox.id"
envNVGPU = "NVIDIA_VISIBLE_DEVICES"
EnvResourceIndex = "ALIYUN_COM_GPU_MEM_IDX"
EnvResourceByPod = "ALIYUN_COM_GPU_MEM_POD"
EnvResourceByContainer = "ALIYUN_COM_GPU_MEM_CONTAINER"
EnvResourceByDev = "ALIYUN_COM_GPU_MEM_DEV"
EnvAssignedFlag = "ALIYUN_COM_GPU_MEM_ASSIGNED"
EnvResourceAssumeTime = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
EnvResourceAssignTime = "ALIYUN_COM_GPU_MEM_ASSIGN_TIME"
EnvNodeLabelForDisableCGPU = "cgpu.disable.isolation"
GiBPrefix = MemoryUnit("GiB")
MiBPrefix = MemoryUnit("MiB")
)
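
To connect these constants to how a workload actually requests shared GPU memory, here is a minimal sketch (container name and image are hypothetical) of a resource limit that getGPUMemoryFromPodResource later sums up:

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// A container asking for 3 units of rainbond.com/gpu-mem; the plugin reads
	// this limit back when matching pending pods against allocation requests.
	c := v1.Container{
		Name:  "cuda-demo",             // hypothetical name
		Image: "nvidia/cuda:10.0-base", // hypothetical image
		Resources: v1.ResourceRequirements{
			Limits: v1.ResourceList{
				v1.ResourceName("rainbond.com/gpu-mem"): resource.MustParse("3"),
			},
		},
	}
	fmt.Println(c.Resources.Limits)
}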


@@ -0,0 +1,30 @@
package nvidia
import (
"io/ioutil"
"runtime"
log "github.com/golang/glog"
)
func StackTrace(all bool) string {
buf := make([]byte, 10240)
for {
size := runtime.Stack(buf, all)
if size == len(buf) {
buf = make([]byte, len(buf)<<1)
continue
}
break
}
return string(buf)
}
func coredump(fileName string) {
log.Infoln("Dump stacktrace to ", fileName)
if err := ioutil.WriteFile(fileName, []byte(StackTrace(true)), 0644); err != nil {
log.Warningf("Failed to write stacktrace to %s: %v", fileName, err)
}
}


@@ -0,0 +1,111 @@
package nvidia
import (
"fmt"
"github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client"
"syscall"
"os"
"time"
"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
"github.com/fsnotify/fsnotify"
log "github.com/golang/glog"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
)
type sharedGPUManager struct {
enableMPS bool
healthCheck bool
queryKubelet bool
kubeletClient *client.KubeletClient
}
func NewSharedGPUManager(enableMPS, healthCheck, queryKubelet bool, bp MemoryUnit, client *client.KubeletClient) *sharedGPUManager {
metric = bp
return &sharedGPUManager{
enableMPS: enableMPS,
healthCheck: healthCheck,
queryKubelet: queryKubelet,
kubeletClient: client,
}
}
func (ngm *sharedGPUManager) Run() error {
log.V(1).Infoln("Loading NVML")
if err := nvml.Init(); err != nil {
log.V(1).Infof("Failed to initialize NVML: %s.", err)
log.V(1).Infof("If this is a GPU node, did you set the docker default runtime to `nvidia`?")
select {}
}
defer func() { log.V(1).Infoln("Shutdown of NVML returned:", nvml.Shutdown()) }()
log.V(1).Infoln("Fetching devices.")
if getDeviceCount() == uint(0) {
log.V(1).Infoln("No devices found. Waiting indefinitely.")
select {}
}
log.V(1).Infoln("Starting FS watcher.")
watcher, err := newFSWatcher(pluginapi.DevicePluginPath)
if err != nil {
log.V(1).Infoln("Failed to created FS watcher.")
return err
}
defer watcher.Close()
log.V(1).Infoln("Starting OS watcher.")
sigs := newOSWatcher(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
restart := true
var devicePlugin *NvidiaDevicePlugin
L:
for {
if restart {
if devicePlugin != nil {
devicePlugin.Stop()
}
devicePlugin, err = NewNvidiaDevicePlugin(ngm.enableMPS, ngm.healthCheck, ngm.queryKubelet, ngm.kubeletClient)
if err != nil {
log.Warningf("Failed to get device plugin due to %v", err)
os.Exit(1)
} else if err = devicePlugin.Serve(); err != nil {
log.Warningf("Failed to start device plugin due to %v", err)
os.Exit(2)
} else {
restart = false
}
}
select {
case event := <-watcher.Events:
if event.Name == pluginapi.KubeletSocket && event.Op&fsnotify.Create == fsnotify.Create {
log.V(1).Infof("inotify: %s created, restarting.", pluginapi.KubeletSocket)
restart = true
}
case err := <-watcher.Errors:
log.Warningf("inotify: %s", err)
case s := <-sigs:
switch s {
case syscall.SIGHUP:
log.V(1).Infoln("Received SIGHUP, restarting.")
restart = true
case syscall.SIGQUIT:
t := time.Now()
timestamp := t.Format("20060102150405")
log.Infoln("generate core dump")
coredump("/etc/kubernetes/go_" + timestamp + ".txt")
default:
log.V(1).Infof("Received signal \"%v\", shutting down.", s)
devicePlugin.Stop()
break L
}
}
}
return nil
}
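
A minimal sketch of how this manager might be wired up by an entrypoint. The nvidia package import path and the flag values here are assumptions (the real main likely reads them from command-line flags and also supplies kubelet credentials):

package main

import (
	"time"

	"github.com/AliyunContainerService/gpushare-device-plugin/pkg/gpu/nvidia" // assumed package path
	"github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client"
	log "github.com/golang/glog"
)

func main() {
	// Hypothetical kubelet client; a real deployment would also set a bearer
	// token or client certificate (see the kubelet client test at the end).
	kubeletClient, err := client.NewKubeletClient(&client.KubeletClientConfig{
		Address:     "127.0.0.1",
		Port:        10250,
		HTTPTimeout: 10 * time.Second,
	})
	if err != nil {
		log.Fatalf("failed to create kubelet client: %v", err)
	}
	ngm := nvidia.NewSharedGPUManager(false, true, true, nvidia.GiBPrefix, kubeletClient)
	if err := ngm.Run(); err != nil {
		log.Fatalf("shared GPU manager exited: %v", err)
	}
}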


@@ -0,0 +1,152 @@
package nvidia
import (
"fmt"
"strings"
log "github.com/golang/glog"
"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
"golang.org/x/net/context"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
)
var (
gpuMemory uint
metric MemoryUnit
)
func check(err error) {
if err != nil {
log.Fatalln("Fatal:", err)
}
}
func generateFakeDeviceID(realID string, fakeCounter uint) string {
return fmt.Sprintf("%s-_-%d", realID, fakeCounter)
}
func extractRealDeviceID(fakeDeviceID string) string {
return strings.Split(fakeDeviceID, "-_-")[0]
}
func setGPUMemory(raw uint) {
v := raw
if metric == GiBPrefix {
v = raw / 1024
}
gpuMemory = v
log.Infof("set gpu memory: %d", gpuMemory)
}
func getGPUMemory() uint {
return gpuMemory
}
func getDeviceCount() uint {
n, err := nvml.GetDeviceCount()
check(err)
return n
}
func getDevices() ([]*pluginapi.Device, map[string]uint) {
n, err := nvml.GetDeviceCount()
check(err)
var devs []*pluginapi.Device
realDevNames := map[string]uint{}
for i := uint(0); i < n; i++ {
d, err := nvml.NewDevice(i)
check(err)
// realDevNames = append(realDevNames, d.UUID)
var id uint
log.Infof("Deivce %s's Path is %s", d.UUID, d.Path)
_, err = fmt.Sscanf(d.Path, "/dev/nvidia%d", &id)
check(err)
realDevNames[d.UUID] = id
// var KiB uint64 = 1024
log.Infof("# device Memory: %d", uint(*d.Memory))
if getGPUMemory() == uint(0) {
setGPUMemory(uint(*d.Memory))
}
for j := uint(0); j < getGPUMemory(); j++ {
fakeID := generateFakeDeviceID(d.UUID, j)
if j == 0 {
log.Infoln("# Add first device ID: " + fakeID)
}
if j == getGPUMemory()-1 {
log.Infoln("# Add last device ID: " + fakeID)
}
devs = append(devs, &pluginapi.Device{
ID: fakeID,
Health: pluginapi.Healthy,
})
}
}
return devs, realDevNames
}
func deviceExists(devs []*pluginapi.Device, id string) bool {
for _, d := range devs {
if d.ID == id {
return true
}
}
return false
}
func watchXIDs(ctx context.Context, devs []*pluginapi.Device, xids chan<- *pluginapi.Device) {
eventSet := nvml.NewEventSet()
defer nvml.DeleteEventSet(eventSet)
for _, d := range devs {
realDeviceID := extractRealDeviceID(d.ID)
err := nvml.RegisterEventForDevice(eventSet, nvml.XidCriticalError, realDeviceID)
if err != nil && strings.HasSuffix(err.Error(), "Not Supported") {
log.Infof("Warning: %s (%s) is too old to support healthchecking: %s. Marking it unhealthy.", realDeviceID, d.ID, err)
xids <- d
continue
}
if err != nil {
log.Fatalf("Fatal error:", err)
}
}
for {
select {
case <-ctx.Done():
return
default:
}
e, err := nvml.WaitForEvent(eventSet, 5000)
if err != nil && e.Etype != nvml.XidCriticalError {
continue
}
// FIXME: formalize the full list and document it.
// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
// Application errors: the GPU should still be healthy
if e.Edata == 31 || e.Edata == 43 || e.Edata == 45 {
continue
}
if e.UUID == nil || len(*e.UUID) == 0 {
// All devices are unhealthy
for _, d := range devs {
xids <- d
}
continue
}
for _, d := range devs {
if extractRealDeviceID(d.ID) == *e.UUID {
xids <- d
}
}
}
}
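
The fake-device scheme above is what turns one physical GPU into many schedulable units: a card reporting N memory units is advertised as N fake device IDs that all resolve back to the same UUID. A self-contained sketch of that round trip, restating the two small helpers locally and using a made-up UUID:

package main

import (
	"fmt"
	"strings"
)

// Local restatements of generateFakeDeviceID / extractRealDeviceID for illustration.
func generateFakeDeviceID(realID string, fakeCounter uint) string {
	return fmt.Sprintf("%s-_-%d", realID, fakeCounter)
}

func extractRealDeviceID(fakeDeviceID string) string {
	return strings.Split(fakeDeviceID, "-_-")[0]
}

func main() {
	const uuid = "GPU-11111111-2222-3333-4444-555555555555" // hypothetical UUID
	const gpuMemory uint = 15                               // e.g. a 15 GiB card with metric GiB
	for j := uint(0); j < gpuMemory; j++ {
		fake := generateFakeDeviceID(uuid, j)
		if extractRealDeviceID(fake) != uuid {
			panic("round trip failed")
		}
		if j == 0 || j == gpuMemory-1 {
			fmt.Println(fake) // first and last advertised device IDs
		}
	}
}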


@@ -0,0 +1,262 @@
package nvidia
import (
"encoding/json"
"fmt"
"github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client"
log "github.com/golang/glog"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
nodeutil "k8s.io/kubernetes/pkg/util/node"
"os"
"sort"
"time"
)
var (
clientset *kubernetes.Clientset
nodeName string
retries = 8
)
func kubeInit() {
kubeconfigFile := os.Getenv("KUBECONFIG")
var err error
var config *rest.Config
if _, err = os.Stat(kubeconfigFile); err != nil {
log.V(5).Infof("kubeconfig %s failed to find due to %v", kubeconfigFile, err)
config, err = rest.InClusterConfig()
if err != nil {
log.Fatalf("Failed due to %v", err)
}
} else {
config, err = clientcmd.BuildConfigFromFlags("", kubeconfigFile)
if err != nil {
log.Fatalf("Failed due to %v", err)
}
}
clientset, err = kubernetes.NewForConfig(config)
if err != nil {
log.Fatalf("Failed due to %v", err)
}
nodeName = os.Getenv("NODE_NAME")
if nodeName == "" {
log.Fatalln("Please set env NODE_NAME")
}
}
func disableCGPUIsolationOrNot() (bool, error) {
disable := false
node, err := clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
if err != nil {
return disable, err
}
labels := node.ObjectMeta.Labels
value, ok := labels[EnvNodeLabelForDisableCGPU]
if ok && value == "true" {
log.Infof("enable gpusharing mode and disable cgpu mode")
disable = true
}
return disable, nil
}
func patchGPUCount(gpuCount int) error {
node, err := clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
if err != nil {
return err
}
if val, ok := node.Status.Capacity[resourceCount]; ok {
if val.Value() == int64(gpuCount) {
log.Infof("No need to update Capacity %s", resourceCount)
return nil
}
}
newNode := node.DeepCopy()
newNode.Status.Capacity[resourceCount] = *resource.NewQuantity(int64(gpuCount), resource.DecimalSI)
newNode.Status.Allocatable[resourceCount] = *resource.NewQuantity(int64(gpuCount), resource.DecimalSI)
// content := fmt.Sprintf(`[{"op": "add", "path": "/status/capacity/aliyun.com~gpu-count", "value": "%d"}]`, gpuCount)
// _, err = clientset.CoreV1().Nodes().PatchStatus(nodeName, []byte(content))
_, _, err = nodeutil.PatchNodeStatus(clientset.CoreV1(), types.NodeName(nodeName), node, newNode)
if err != nil {
log.Infof("Failed to update Capacity %s.", resourceCount)
} else {
log.Infof("Updated Capacity %s successfully.", resourceCount)
}
return err
}
func getPodList(kubeletClient *client.KubeletClient) (*v1.PodList, error) {
podList, err := kubeletClient.GetNodeRunningPods()
if err != nil {
return nil, err
}
list, _ := json.Marshal(podList)
log.V(8).Infof("get pods list %v", string(list))
resultPodList := &v1.PodList{}
for _, metaPod := range podList.Items {
if metaPod.Status.Phase != v1.PodPending {
continue
}
resultPodList.Items = append(resultPodList.Items, metaPod)
}
if len(resultPodList.Items) == 0 {
return nil, fmt.Errorf("not found pending pod")
}
return resultPodList, nil
}
func getPodListsByQueryKubelet(kubeletClient *client.KubeletClient) (*v1.PodList, error) {
podList, err := getPodList(kubeletClient)
for i := 0; i < retries && err != nil; i++ {
podList, err = getPodList(kubeletClient)
log.Warningf("failed to get pending pod list, retry")
time.Sleep(100 * time.Millisecond)
}
if err != nil {
log.Warningf("not found from kubelet /pods api, start to list apiserver")
podList, err = getPodListsByListAPIServer()
if err != nil {
return nil, err
}
}
return podList, nil
}
func getPodListsByListAPIServer() (*v1.PodList, error) {
selector := fields.SelectorFromSet(fields.Set{"spec.nodeName": nodeName, "status.phase": "Pending"})
podList, err := clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
FieldSelector: selector.String(),
LabelSelector: labels.Everything().String(),
})
for i := 0; i < 3 && err != nil; i++ {
podList, err = clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
FieldSelector: selector.String(),
LabelSelector: labels.Everything().String(),
})
time.Sleep(1 * time.Second)
}
if err != nil {
return nil, fmt.Errorf("failed to get Pods assigned to node %v", nodeName)
}
return podList, nil
}
func getPendingPodsInNode(queryKubelet bool, kubeletClient *client.KubeletClient) ([]v1.Pod, error) {
// pods, err := m.lister.List(labels.Everything())
// if err != nil {
// return nil, err
// }
pods := []v1.Pod{}
podIDMap := map[types.UID]bool{}
var podList *v1.PodList
var err error
if queryKubelet {
podList, err = getPodListsByQueryKubelet(kubeletClient)
if err != nil {
return nil, err
}
} else {
podList, err = getPodListsByListAPIServer()
if err != nil {
return nil, err
}
}
log.V(5).Infof("all pod list %v", podList.Items)
// if log.V(5) {
for _, pod := range podList.Items {
if pod.Spec.NodeName != nodeName {
log.Warningf("Pod name %s in ns %s is not assigned to node %s as expected, it's placed on node %s ",
pod.Name,
pod.Namespace,
nodeName,
pod.Spec.NodeName)
} else {
log.Infof("list pod %s in ns %s in node %s and status is %s",
pod.Name,
pod.Namespace,
nodeName,
pod.Status.Phase,
)
if _, ok := podIDMap[pod.UID]; !ok {
pods = append(pods, pod)
podIDMap[pod.UID] = true
}
}
}
// }
return pods, nil
}
// getCandidatePods picks up the GPU share pods that are assumed but not yet assigned, ordered by assume time
func getCandidatePods(queryKubelet bool, client *client.KubeletClient) ([]*v1.Pod, error) {
candidatePods := []*v1.Pod{}
allPods, err := getPendingPodsInNode(queryKubelet, client)
if err != nil {
return candidatePods, err
}
for _, pod := range allPods {
current := pod
if isGPUMemoryAssumedPod(&current) {
candidatePods = append(candidatePods, &current)
}
}
if log.V(4) {
for _, pod := range candidatePods {
log.Infof("candidate pod %s in ns %s with timestamp %d is found.",
pod.Name,
pod.Namespace,
getAssumeTimeFromPodAnnotation(pod))
}
}
return makePodOrderdByAge(candidatePods), nil
}
// make the pod ordered by GPU assumed time
func makePodOrderdByAge(pods []*v1.Pod) []*v1.Pod {
newPodList := make(orderedPodByAssumeTime, 0, len(pods))
for _, v := range pods {
newPodList = append(newPodList, v)
}
sort.Sort(newPodList)
return []*v1.Pod(newPodList)
}
type orderedPodByAssumeTime []*v1.Pod
func (this orderedPodByAssumeTime) Len() int {
return len(this)
}
func (this orderedPodByAssumeTime) Less(i, j int) bool {
return getAssumeTimeFromPodAnnotation(this[i]) <= getAssumeTimeFromPodAnnotation(this[j])
}
func (this orderedPodByAssumeTime) Swap(i, j int) {
this[i], this[j] = this[j], this[i]
}
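
The candidate ordering above sorts assumed pods by their assume-time annotation so the oldest request is served first. A small self-contained sketch of the same idea with two hypothetical pods (using sort.Slice instead of the package's orderedPodByAssumeTime type):

package main

import (
	"fmt"
	"sort"
	"strconv"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const assumeTimeKey = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"

func assumeTime(p *v1.Pod) uint64 {
	t, _ := strconv.ParseUint(p.Annotations[assumeTimeKey], 10, 64)
	return t
}

func main() {
	newer := &v1.Pod{ObjectMeta: metav1.ObjectMeta{
		Name:        "pod-b", // hypothetical pod assumed later
		Annotations: map[string]string{assumeTimeKey: "200"},
	}}
	older := &v1.Pod{ObjectMeta: metav1.ObjectMeta{
		Name:        "pod-a", // hypothetical pod assumed earlier
		Annotations: map[string]string{assumeTimeKey: "100"},
	}}
	pods := []*v1.Pod{newer, older}
	sort.Slice(pods, func(i, j int) bool { return assumeTime(pods[i]) < assumeTime(pods[j]) })
	for _, p := range pods {
		fmt.Println(p.Name) // pod-a prints first: oldest assume time wins
	}
}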


@@ -0,0 +1,182 @@
package nvidia
import (
"encoding/json"
"fmt"
"strconv"
"time"
log "github.com/golang/glog"
v1 "k8s.io/api/core/v1"
)
// updatePodAnnotations returns a copy of the pod with the assigned flag and assume-time annotations set
func updatePodAnnotations(oldPod *v1.Pod) (newPod *v1.Pod) {
newPod = oldPod.DeepCopy()
if len(newPod.ObjectMeta.Annotations) == 0 {
newPod.ObjectMeta.Annotations = map[string]string{}
}
now := time.Now()
newPod.ObjectMeta.Annotations[EnvAssignedFlag] = "true"
newPod.ObjectMeta.Annotations[EnvResourceAssumeTime] = fmt.Sprintf("%d", now.UnixNano())
return newPod
}
func patchPodAnnotationSpecAssigned() ([]byte, error) {
now := time.Now()
patchAnnotations := map[string]interface{}{
"metadata": map[string]map[string]string{"annotations": {
EnvAssignedFlag: "true",
EnvResourceAssumeTime: fmt.Sprintf("%d", now.UnixNano()),
}}}
return json.Marshal(patchAnnotations)
}
func getGPUIDFromPodAnnotation(pod *v1.Pod) (id int) {
var err error
id = -1
if len(pod.ObjectMeta.Annotations) > 0 {
value, found := pod.ObjectMeta.Annotations[EnvResourceIndex]
if found {
id, err = strconv.Atoi(value)
if err != nil {
log.Warningf("Failed to parse dev id %s due to %v for pod %s in ns %s",
value,
err,
pod.Name,
pod.Namespace)
id = -1
}
} else {
log.Warningf("Failed to get dev id %s for pod %s in ns %s",
pod.Name,
pod.Namespace)
}
}
return id
}
// get assumed timestamp
func getAssumeTimeFromPodAnnotation(pod *v1.Pod) (assumeTime uint64) {
if assumeTimeStr, ok := pod.ObjectMeta.Annotations[EnvResourceAssumeTime]; ok {
u64, err := strconv.ParseUint(assumeTimeStr, 10, 64)
if err != nil {
log.Warningf("Failed to parse assume Timestamp %s due to %v", assumeTimeStr, err)
} else {
assumeTime = u64
}
}
return assumeTime
}
// determine if the pod is GPU share pod, and is already assumed but not assigned
func isGPUMemoryAssumedPod(pod *v1.Pod) (assumed bool) {
log.V(6).Infof("Determine if the pod %v is GPUSharedAssumed pod", pod)
var ok bool
// 1. Check if it's for GPU share
if getGPUMemoryFromPodResource(pod) <= 0 {
log.V(6).Infof("Pod %s in namespace %s has not GPU Memory Request, so it's not GPUSharedAssumed assumed pod.",
pod.Name,
pod.Namespace)
return assumed
}
// 2. Check if it already has assume time
if _, ok = pod.ObjectMeta.Annotations[EnvResourceAssumeTime]; !ok {
log.V(4).Infof("No assume timestamp for pod %s in namespace %s, so it's not GPUSharedAssumed assumed pod.",
pod.Name,
pod.Namespace)
return assumed
}
// 3. Check if it has been assigned already
if assigned, ok := pod.ObjectMeta.Annotations[EnvAssignedFlag]; ok {
if assigned == "false" {
log.V(4).Infof("Found GPUSharedAssumed assumed pod %s in namespace %s.",
pod.Name,
pod.Namespace)
assumed = true
} else {
log.Infof("GPU assigned Flag for pod %s exists in namespace %s and its assigned status is %s, so it's not GPUSharedAssumed assumed pod.",
pod.Name,
pod.Namespace,
assigned)
}
} else {
log.Warningf("No GPU assigned Flag for pod %s in namespace %s, so it's not GPUSharedAssumed assumed pod.",
pod.Name,
pod.Namespace)
}
return assumed
}
// Get GPU Memory of the Pod
func getGPUMemoryFromPodResource(pod *v1.Pod) uint {
var total uint
containers := pod.Spec.Containers
for _, container := range containers {
if val, ok := container.Resources.Limits[resourceName]; ok {
total += uint(val.Value())
}
}
return total
}
func podIsNotRunning(pod v1.Pod) bool {
status := pod.Status
//deletionTimestamp
if pod.DeletionTimestamp != nil {
return true
}
// pod is scheduled but not initialized
if status.Phase == v1.PodPending && podConditionTrueOnly(status.Conditions, v1.PodScheduled) {
log.Infof("Pod %s only has PodScheduled, is not running", pod.Name)
return true
}
return status.Phase == v1.PodFailed || status.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(status.ContainerStatuses)) || (status.Phase == v1.PodPending && podConditionTrueOnly(status.Conditions, v1.PodScheduled))
}
// notRunning returns true if every status is terminated or waiting, or the status list
// is empty.
func notRunning(statuses []v1.ContainerStatus) bool {
for _, status := range statuses {
if status.State.Terminated == nil && status.State.Waiting == nil {
return false
}
}
return true
}
func podConditionTrue(conditions []v1.PodCondition, expect v1.PodConditionType) bool {
for _, condition := range conditions {
if condition.Type == expect && condition.Status == v1.ConditionTrue {
return true
}
}
return false
}
func podConditionTrueOnly(conditions []v1.PodCondition, expect v1.PodConditionType) bool {
if len(conditions) != 1 {
return false
}
for _, condition := range conditions {
if condition.Type == expect && condition.Status == v1.ConditionTrue {
return true
}
}
return false
}
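
For clarity, this is roughly the strategic-merge-patch body that patchPodAnnotationSpecAssigned produces and Allocate sends to the API server; a standalone sketch reproducing it with the same annotation keys:

package main

import (
	"encoding/json"
	"fmt"
	"time"
)

func main() {
	now := time.Now()
	patch := map[string]interface{}{
		"metadata": map[string]map[string]string{"annotations": {
			"ALIYUN_COM_GPU_MEM_ASSIGNED":    "true",
			"ALIYUN_COM_GPU_MEM_ASSUME_TIME": fmt.Sprintf("%d", now.UnixNano()),
		}},
	}
	body, _ := json.Marshal(patch)
	// Prints something like:
	// {"metadata":{"annotations":{"ALIYUN_COM_GPU_MEM_ASSIGNED":"true","ALIYUN_COM_GPU_MEM_ASSUME_TIME":"1700000000000000000"}}}
	fmt.Println(string(body))
}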


@@ -0,0 +1,241 @@
package nvidia
import (
"github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client"
"net"
"os"
"path"
"sync"
"time"
log "github.com/golang/glog"
"golang.org/x/net/context"
"google.golang.org/grpc"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
)
// NvidiaDevicePlugin implements the Kubernetes device plugin API
type NvidiaDevicePlugin struct {
devs []*pluginapi.Device
realDevNames []string
devNameMap map[string]uint
devIndxMap map[uint]string
socket string
mps bool
healthCheck bool
disableCGPUIsolation bool
stop chan struct{}
health chan *pluginapi.Device
queryKubelet bool
kubeletClient *client.KubeletClient
server *grpc.Server
sync.RWMutex
}
// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
func NewNvidiaDevicePlugin(mps, healthCheck, queryKubelet bool, client *client.KubeletClient) (*NvidiaDevicePlugin, error) {
devs, devNameMap := getDevices()
devList := []string{}
for dev := range devNameMap {
devList = append(devList, dev)
}
log.Infof("Device Map: %v", devNameMap)
log.Infof("Device List: %v", devList)
err := patchGPUCount(len(devList))
if err != nil {
return nil, err
}
disableCGPUIsolation, err := disableCGPUIsolationOrNot()
if err != nil {
return nil, err
}
return &NvidiaDevicePlugin{
devs: devs,
realDevNames: devList,
devNameMap: devNameMap,
socket: serverSock,
mps: mps,
healthCheck: healthCheck,
disableCGPUIsolation: disableCGPUIsolation,
stop: make(chan struct{}),
health: make(chan *pluginapi.Device),
queryKubelet: queryKubelet,
kubeletClient: client,
}, nil
}
func (m *NvidiaDevicePlugin) GetDeviceNameByIndex(index uint) (name string, found bool) {
if len(m.devIndxMap) == 0 {
m.devIndxMap = map[uint]string{}
for k, v := range m.devNameMap {
m.devIndxMap[v] = k
}
log.Infof("Get devIndexMap: %v", m.devIndxMap)
}
name, found = m.devIndxMap[index]
return name, found
}
func (m *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
return &pluginapi.DevicePluginOptions{}, nil
}
// dial establishes a gRPC connection to the given unix socket.
func dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) {
c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(),
grpc.WithTimeout(timeout),
grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
return net.DialTimeout("unix", addr, timeout)
}),
)
if err != nil {
return nil, err
}
return c, nil
}
// Start starts the gRPC server of the device plugin
func (m *NvidiaDevicePlugin) Start() error {
err := m.cleanup()
if err != nil {
return err
}
sock, err := net.Listen("unix", m.socket)
if err != nil {
return err
}
m.server = grpc.NewServer([]grpc.ServerOption{}...)
pluginapi.RegisterDevicePluginServer(m.server, m)
go m.server.Serve(sock)
// Wait for server to start by launching a blocking connection
conn, err := dial(m.socket, 5*time.Second)
if err != nil {
return err
}
conn.Close()
go m.healthcheck()
lastAllocateTime = time.Now()
return nil
}
// Stop stops the gRPC server
func (m *NvidiaDevicePlugin) Stop() error {
if m.server == nil {
return nil
}
m.server.Stop()
m.server = nil
close(m.stop)
return m.cleanup()
}
// Register registers the device plugin for the given resourceName with Kubelet.
func (m *NvidiaDevicePlugin) Register(kubeletEndpoint, resourceName string) error {
conn, err := dial(kubeletEndpoint, 5*time.Second)
if err != nil {
return err
}
defer conn.Close()
client := pluginapi.NewRegistrationClient(conn)
reqt := &pluginapi.RegisterRequest{
Version: pluginapi.Version,
Endpoint: path.Base(m.socket),
ResourceName: resourceName,
}
_, err = client.Register(context.Background(), reqt)
if err != nil {
return err
}
return nil
}
// ListAndWatch lists devices and updates that list according to the health status
func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})
for {
select {
case <-m.stop:
return nil
case d := <-m.health:
// FIXME: there is no way to recover from the Unhealthy state.
d.Health = pluginapi.Unhealthy
s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})
}
}
}
func (m *NvidiaDevicePlugin) unhealthy(dev *pluginapi.Device) {
m.health <- dev
}
func (m *NvidiaDevicePlugin) PreStartContainer(context.Context, *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
return &pluginapi.PreStartContainerResponse{}, nil
}
func (m *NvidiaDevicePlugin) cleanup() error {
if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) {
return err
}
return nil
}
func (m *NvidiaDevicePlugin) healthcheck() {
ctx, cancel := context.WithCancel(context.Background())
var xids chan *pluginapi.Device
if m.healthCheck {
xids = make(chan *pluginapi.Device)
go watchXIDs(ctx, m.devs, xids)
}
for {
select {
case <-m.stop:
cancel()
return
case dev := <-xids:
m.unhealthy(dev)
}
}
}
// Serve starts the gRPC server and registers the device plugin with the Kubelet
func (m *NvidiaDevicePlugin) Serve() error {
err := m.Start()
if err != nil {
log.Infof("Could not start device plugin: %s", err)
return err
}
log.Infoln("Starting to serve on", m.socket)
err = m.Register(pluginapi.KubeletSocket, resourceName)
if err != nil {
log.Infof("Could not register device plugin: %s", err)
m.Stop()
return err
}
log.Infoln("Registered device plugin with Kubelet")
return nil
}


@@ -0,0 +1,32 @@
package nvidia
import (
"os"
"os/signal"
"github.com/fsnotify/fsnotify"
)
func newFSWatcher(files ...string) (*fsnotify.Watcher, error) {
watcher, err := fsnotify.NewWatcher()
if err != nil {
return nil, err
}
for _, f := range files {
err = watcher.Add(f)
if err != nil {
watcher.Close()
return nil, err
}
}
return watcher, nil
}
func newOSWatcher(sigs ...os.Signal) chan os.Signal {
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, sigs...)
return sigChan
}


@@ -0,0 +1,134 @@
package client
import (
"encoding/json"
"fmt"
"io"
v1 "k8s.io/api/core/v1"
utilnet "k8s.io/apimachinery/pkg/util/net"
restclient "k8s.io/client-go/rest"
"k8s.io/client-go/transport"
"net/http"
"time"
)
// KubeletClientConfig defines config parameters for the kubelet client
type KubeletClientConfig struct {
// Address specifies the kubelet address
Address string
// Port specifies the default port - used if no information about Kubelet port can be found in Node.NodeStatus.DaemonEndpoints.
Port uint
// TLSClientConfig contains settings to enable transport layer security
restclient.TLSClientConfig
// Server requires Bearer authentication
BearerToken string
// HTTPTimeout is used by the client to timeout http requests to Kubelet.
HTTPTimeout time.Duration
}
type KubeletClient struct {
defaultPort uint
host string
client *http.Client
}
func NewKubeletClient(config *KubeletClientConfig) (*KubeletClient, error) {
trans, err := makeTransport(config, true)
if err != nil {
return nil, err
}
client := &http.Client{
Transport: trans,
Timeout: config.HTTPTimeout,
}
return &KubeletClient{
host: config.Address,
defaultPort: config.Port,
client: client,
}, nil
}
// transportConfig converts a client config to an appropriate transport config.
func (c *KubeletClientConfig) transportConfig() *transport.Config {
cfg := &transport.Config{
TLS: transport.TLSConfig{
CAFile: c.CAFile,
CAData: c.CAData,
CertFile: c.CertFile,
CertData: c.CertData,
KeyFile: c.KeyFile,
KeyData: c.KeyData,
},
BearerToken: c.BearerToken,
}
if !cfg.HasCA() {
cfg.TLS.Insecure = true
}
return cfg
}
// makeTransport creates a RoundTripper for HTTP Transport.
func makeTransport(config *KubeletClientConfig, insecureSkipTLSVerify bool) (http.RoundTripper, error) {
// do the insecureSkipTLSVerify on the pre-transport *before* we go get a potentially cached connection.
// transportConfig always produces a new struct pointer.
preTLSConfig := config.transportConfig()
if insecureSkipTLSVerify && preTLSConfig != nil {
preTLSConfig.TLS.Insecure = true
preTLSConfig.TLS.CAData = nil
preTLSConfig.TLS.CAFile = ""
}
tlsConfig, err := transport.TLSConfigFor(preTLSConfig)
if err != nil {
return nil, err
}
rt := http.DefaultTransport
if tlsConfig != nil {
// Use a transport with the computed TLS configuration when one is required
rt = utilnet.SetOldTransportDefaults(&http.Transport{
TLSClientConfig: tlsConfig,
})
}
return transport.HTTPWrappersForConfig(config.transportConfig(), rt)
}
func ReadAll(r io.Reader) ([]byte, error) {
b := make([]byte, 0, 512)
for {
if len(b) == cap(b) {
// Add more capacity (let append pick how much).
b = append(b, 0)[:len(b)]
}
n, err := r.Read(b[len(b):cap(b)])
b = b[:len(b)+n]
if err != nil {
if err == io.EOF {
err = nil
}
return b, err
}
}
}
func (k *KubeletClient) GetNodeRunningPods() (*v1.PodList, error) {
resp, err := k.client.Get(fmt.Sprintf("https://%v:%d/pods/", k.host, k.defaultPort))
if err != nil {
return nil, err
}
defer resp.Body.Close()
body, err := ReadAll(resp.Body)
if err != nil {
return nil, err
}
podLists := &v1.PodList{}
if err = json.Unmarshal(body, podLists); err != nil {
return nil, err
}
return podLists, nil
}


@@ -0,0 +1,57 @@
package client
import (
"flag"
"fmt"
"io/ioutil"
"k8s.io/client-go/rest"
"testing"
"time"
)
var (
clientCert string
clientKey string
token string
timeout int
)
func TestNewKubeletClient(t *testing.T) {
flag.StringVar(&clientCert, "client-cert", "", "")
flag.StringVar(&clientKey, "client-key", "", "")
flag.StringVar(&token, "token", "", "")
flag.IntVar(&timeout, "timeout", 10, "")
flag.Parse()
if clientCert == "" && clientKey == "" && token == "" {
tokenByte, err := ioutil.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/token")
if err != nil {
panic(fmt.Errorf("in cluster mode, find token failed, error: %v", err))
}
token = string(tokenByte)
}
c, err := NewKubeletClient(&KubeletClientConfig{
Address: "127.0.0.1",
Port: 10250,
TLSClientConfig: rest.TLSClientConfig{
Insecure: true,
ServerName: "kubelet",
CertFile: clientCert,
KeyFile: clientKey,
},
BearerToken: token,
HTTPTimeout: time.Duration(timeout) * time.Second,
})
if err != nil {
fmt.Println(err)
return
}
podsList, err := c.GetNodeRunningPods()
if err != nil {
fmt.Println(err)
return
}
fmt.Println(podsList)
}