synchronization
gpushare-device-plugin/cmd/inspect/display.go (new file, 255 lines)
@@ -0,0 +1,255 @@
package main

import (
	"bytes"
	"fmt"
	"os"
	"strconv"
	"text/tabwriter"

	log "github.com/golang/glog"
	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
)

func displayDetails(nodeInfos []*NodeInfo) {
	w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
	var (
		totalGPUMemInCluster int64
		usedGPUMemInCluster  int64
		prtLineLen           int
	)

	for _, nodeInfo := range nodeInfos {
		address := "unknown"
		if len(nodeInfo.node.Status.Addresses) > 0 {
			// address = nodeInfo.node.Status.Addresses[0].Address
			for _, addr := range nodeInfo.node.Status.Addresses {
				if addr.Type == v1.NodeInternalIP {
					address = addr.Address
					break
				}
			}
		}

		totalGPUMemInNode := nodeInfo.gpuTotalMemory
		if totalGPUMemInNode <= 0 {
			continue
		}

		fmt.Fprintf(w, "\n")
		fmt.Fprintf(w, "NAME:\t%s\n", nodeInfo.node.Name)
		fmt.Fprintf(w, "IPADDRESS:\t%s\n", address)
		fmt.Fprintf(w, "\n")

		usedGPUMemInNode := 0
		var buf bytes.Buffer
		buf.WriteString("NAME\tNAMESPACE\t")
		for i := 0; i < nodeInfo.gpuCount; i++ {
			buf.WriteString(fmt.Sprintf("GPU%d(Allocated)\t", i))
		}

		if nodeInfo.hasPendingGPUMemory() {
			buf.WriteString("Pending(Allocated)\t")
		}
		buf.WriteString("\n")
		fmt.Fprintf(w, buf.String())

		var buffer bytes.Buffer
		exists := map[types.UID]bool{}
		for i, dev := range nodeInfo.devs {
			usedGPUMemInNode += dev.usedGPUMem
			for _, pod := range dev.pods {
				if _, ok := exists[pod.UID]; ok {
					continue
				}
				buffer.WriteString(fmt.Sprintf("%s\t%s\t", pod.Name, pod.Namespace))
				count := nodeInfo.gpuCount
				if nodeInfo.hasPendingGPUMemory() {
					count += 1
				}

				for k := 0; k < count; k++ {
					allocation := GetAllocation(&pod)
					if len(allocation) != 0 {
						buffer.WriteString(fmt.Sprintf("%d\t", allocation[k]))
						continue
					}
					if k == i || (i == -1 && k == nodeInfo.gpuCount) {
						buffer.WriteString(fmt.Sprintf("%d\t", getGPUMemoryInPod(pod)))
					} else {
						buffer.WriteString("0\t")
					}
				}
				buffer.WriteString("\n")
				exists[pod.UID] = true
			}
		}
		if prtLineLen == 0 {
			prtLineLen = buffer.Len() + 10
		}
		fmt.Fprintf(w, buffer.String())

		var gpuUsageInNode float64 = 0
		if totalGPUMemInNode > 0 {
			gpuUsageInNode = float64(usedGPUMemInNode) / float64(totalGPUMemInNode) * 100
		} else {
			fmt.Fprintf(w, "\n")
		}

		fmt.Fprintf(w, "Allocated :\t%d (%d%%)\t\n", usedGPUMemInNode, int64(gpuUsageInNode))
		fmt.Fprintf(w, "Total :\t%d \t\n", nodeInfo.gpuTotalMemory)
		// fmt.Fprintf(w, "-----------------------------------------------------------------------------------------\n")
		var prtLine bytes.Buffer
		for i := 0; i < prtLineLen; i++ {
			prtLine.WriteString("-")
		}
		prtLine.WriteString("\n")
		fmt.Fprintf(w, prtLine.String())
		totalGPUMemInCluster += int64(totalGPUMemInNode)
		usedGPUMemInCluster += int64(usedGPUMemInNode)
	}
	fmt.Fprintf(w, "\n")
	fmt.Fprintf(w, "\n")
	fmt.Fprintf(w, "Allocated/Total GPU Memory In Cluster:\t")
	log.V(2).Infof("gpu: %s, allocated GPU Memory %s", strconv.FormatInt(totalGPUMemInCluster, 10),
		strconv.FormatInt(usedGPUMemInCluster, 10))

	var gpuUsage float64 = 0
	if totalGPUMemInCluster > 0 {
		gpuUsage = float64(usedGPUMemInCluster) / float64(totalGPUMemInCluster) * 100
	}
	fmt.Fprintf(w, "%s/%s (%d%%)\t\n",
		strconv.FormatInt(usedGPUMemInCluster, 10),
		strconv.FormatInt(totalGPUMemInCluster, 10),
		int64(gpuUsage))
	// fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\n", ...)

	_ = w.Flush()
}

func getMaxGPUCount(nodeInfos []*NodeInfo) (max int) {
	for _, node := range nodeInfos {
		if node.gpuCount > max {
			max = node.gpuCount
		}
	}

	return max
}

func displaySummary(nodeInfos []*NodeInfo) {
	w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
	var (
		maxGPUCount          int
		totalGPUMemInCluster int64
		usedGPUMemInCluster  int64
		prtLineLen           int
	)

	hasPendingGPU := hasPendingGPUMemory(nodeInfos)

	maxGPUCount = getMaxGPUCount(nodeInfos)

	var buffer bytes.Buffer
	buffer.WriteString("NAME\tIPADDRESS\t")
	for i := 0; i < maxGPUCount; i++ {
		buffer.WriteString(fmt.Sprintf("GPU%d(Allocated/Total)\t", i))
	}

	if hasPendingGPU {
		buffer.WriteString("PENDING(Allocated)\t")
	}
	buffer.WriteString(fmt.Sprintf("GPU Memory(%s)\n", memoryUnit))

	// fmt.Fprintf(w, "NAME\tIPADDRESS\tROLE\tGPU(Allocated/Total)\tPENDING(Allocated)\n")
	fmt.Fprintf(w, buffer.String())
	for _, nodeInfo := range nodeInfos {
		address := "unknown"
		if len(nodeInfo.node.Status.Addresses) > 0 {
			// address = nodeInfo.node.Status.Addresses[0].Address
			for _, addr := range nodeInfo.node.Status.Addresses {
				if addr.Type == v1.NodeInternalIP {
					address = addr.Address
					break
				}
			}
		}

		gpuMemInfos := []string{}
		pendingGPUMemInfo := ""
		usedGPUMemInNode := 0
		totalGPUMemInNode := nodeInfo.gpuTotalMemory
		if totalGPUMemInNode <= 0 {
			continue
		}

		for i := 0; i < maxGPUCount; i++ {
			gpuMemInfo := "0/0"
			if dev, ok := nodeInfo.devs[i]; ok {
				gpuMemInfo = dev.String()
				usedGPUMemInNode += dev.usedGPUMem
			}
			gpuMemInfos = append(gpuMemInfos, gpuMemInfo)
		}

		// check if there is pending dev
		if dev, ok := nodeInfo.devs[-1]; ok {
			pendingGPUMemInfo = fmt.Sprintf("%d", dev.usedGPUMem)
			usedGPUMemInNode += dev.usedGPUMem
		}

		nodeGPUMemInfo := fmt.Sprintf("%d/%d", usedGPUMemInNode, totalGPUMemInNode)

		var buf bytes.Buffer
		buf.WriteString(fmt.Sprintf("%s\t%s\t", nodeInfo.node.Name, address))
		for i := 0; i < maxGPUCount; i++ {
			buf.WriteString(fmt.Sprintf("%s\t", gpuMemInfos[i]))
		}
		if hasPendingGPU {
			buf.WriteString(fmt.Sprintf("%s\t", pendingGPUMemInfo))
		}

		buf.WriteString(fmt.Sprintf("%s\n", nodeGPUMemInfo))
		fmt.Fprintf(w, buf.String())

		if prtLineLen == 0 {
			prtLineLen = buf.Len() + 20
		}

		usedGPUMemInCluster += int64(usedGPUMemInNode)
		totalGPUMemInCluster += int64(totalGPUMemInNode)
	}
	// fmt.Fprintf(w, "-----------------------------------------------------------------------------------------\n")
	var prtLine bytes.Buffer
	for i := 0; i < prtLineLen; i++ {
		prtLine.WriteString("-")
	}
	prtLine.WriteString("\n")
	fmt.Fprint(w, prtLine.String())

	fmt.Fprintf(w, "Allocated/Total GPU Memory In Cluster:\n")
	log.V(2).Infof("gpu: %s, allocated GPU Memory %s", strconv.FormatInt(totalGPUMemInCluster, 10),
		strconv.FormatInt(usedGPUMemInCluster, 10))
	var gpuUsage float64 = 0
	if totalGPUMemInCluster > 0 {
		gpuUsage = float64(usedGPUMemInCluster) / float64(totalGPUMemInCluster) * 100
	}
	fmt.Fprintf(w, "%s/%s (%d%%)\t\n",
		strconv.FormatInt(usedGPUMemInCluster, 10),
		strconv.FormatInt(totalGPUMemInCluster, 10),
		int64(gpuUsage))
	// fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\n", ...)

	_ = w.Flush()
}

func getGPUMemoryInPod(pod v1.Pod) int {
	gpuMem := 0
	for _, container := range pod.Spec.Containers {
		if val, ok := container.Resources.Limits[resourceName]; ok {
			gpuMem += int(val.Value())
		}
	}
	return gpuMem
}
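For orientation, the summary view produced by displaySummary looks roughly like the sketch below; node names, addresses, and memory figures are illustrative only, and the exact columns depend on the largest GPU count in the cluster and on whether any pending allocations exist:

	NAME    IPADDRESS  GPU0(Allocated/Total)  GPU1(Allocated/Total)  GPU Memory(GiB)
	node-a  10.0.0.12  4/15                   0/15                   4/30
	--------------------------------------------------------------------------------
	Allocated/Total GPU Memory In Cluster:
	4/30 (13%)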
gpushare-device-plugin/cmd/inspect/main.go (new file, 74 lines)
@@ -0,0 +1,74 @@
package main

import (
	"flag"
	"fmt"
	"os"

	v1 "k8s.io/api/core/v1"
)

const (
	resourceName         = "rainbond.com/gpu-mem"
	countName            = "rainbond.com/gpu-count"
	gpuCountKey          = "aliyun.accelerator/nvidia_count"
	cardNameKey          = "aliyun.accelerator/nvidia_name"
	gpuMemKey            = "aliyun.accelerator/nvidia_mem"
	pluginComponentKey   = "component"
	pluginComponentValue = "gpushare-device-plugin"

	envNVGPUID             = "ALIYUN_COM_GPU_MEM_IDX"
	envPodGPUMemory        = "ALIYUN_COM_GPU_MEM_POD"
	envTOTALGPUMEMORY      = "ALIYUN_COM_GPU_MEM_DEV"
	gpushareAllocationFlag = "scheduler.framework.gpushare.allocation"
)

func init() {
	kubeInit()
	// checkpointInit()
}

func main() {
	var nodeName string
	// nodeName := flag.String("nodeName", "", "nodeName")
	details := flag.Bool("d", false, "details")
	flag.Parse()

	args := flag.Args()
	if len(args) > 0 {
		nodeName = args[0]
	}

	var pods []v1.Pod
	var nodes []v1.Node
	var err error

	if nodeName == "" {
		nodes, err = getAllSharedGPUNode()
		if err == nil {
			pods, err = getActivePodsInAllNodes()
		}
	} else {
		nodes, err = getNodes(nodeName)
		if err == nil {
			pods, err = getActivePodsByNode(nodeName)
		}
	}

	if err != nil {
		fmt.Printf("Failed due to %v", err)
		os.Exit(1)
	}

	nodeInfos, err := buildAllNodeInfos(pods, nodes)
	if err != nil {
		fmt.Printf("Failed due to %v", err)
		os.Exit(1)
	}
	if *details {
		displayDetails(nodeInfos)
	} else {
		displaySummary(nodeInfos)
	}
}
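A note on how this entrypoint is meant to be invoked, inferred from the flag and argument handling above rather than stated anywhere in the commit: running the binary with no arguments prints the cluster-wide summary, passing a node name as the first positional argument restricts the query to that node, and adding -d switches to the per-pod detail view. kubeInit (defined in podinfo.go below) resolves the cluster from the KUBECONFIG environment variable, falling back to $HOME/.kube/config.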
gpushare-device-plugin/cmd/inspect/nodeinfo.go (new file, 271 lines)
@@ -0,0 +1,271 @@
package main

import (
	"encoding/json"
	"fmt"
	"strconv"

	log "github.com/golang/glog"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	v1 "k8s.io/api/core/v1"
)

type DeviceInfo struct {
	idx         int
	pods        []v1.Pod
	usedGPUMem  int
	totalGPUMem int
	node        v1.Node
}

func (d *DeviceInfo) String() string {
	if d.idx == -1 {
		return fmt.Sprintf("%d", d.usedGPUMem)
	}
	return fmt.Sprintf("%d/%d", d.usedGPUMem, d.totalGPUMem)
}

func (d *DeviceInfo) addGPUPod(pod v1.Pod) {
	if len(d.pods) == 0 {
		d.pods = []v1.Pod{}
	}
	d.pods = append(d.pods, pod)
}

type NodeInfo struct {
	pods           []v1.Pod
	node           v1.Node
	devs           map[int]*DeviceInfo
	gpuCount       int
	gpuTotalMemory int
	pluginPod      v1.Pod
}

// The key function
func buildAllNodeInfos(allPods []v1.Pod, nodes []v1.Node) ([]*NodeInfo, error) {
	nodeInfos := buildNodeInfoWithPods(allPods, nodes)
	for _, info := range nodeInfos {
		if info.gpuTotalMemory > 0 {
			setUnit(info.gpuTotalMemory, info.gpuCount)
			err := info.buildDeviceInfo()
			if err != nil {
				log.Warningf("Failed due to %v", err)
				continue
			}
		}
	}
	return nodeInfos, nil
}

func (n *NodeInfo) acquirePluginPod() v1.Pod {
	if n.pluginPod.Name == "" {
		for _, pod := range n.pods {
			if val, ok := pod.Labels[pluginComponentKey]; ok {
				if val == pluginComponentValue {
					n.pluginPod = pod
					break
				}
			}
		}
	}
	return n.pluginPod
}

func getTotalGPUMemory(node v1.Node) int {
	val, ok := node.Status.Allocatable[resourceName]

	if !ok {
		return 0
	}

	return int(val.Value())
}

func getGPUCountInNode(node v1.Node) int {
	val, ok := node.Status.Allocatable[countName]

	if !ok {
		return 0
	}

	return int(val.Value())
}

func buildNodeInfoWithPods(pods []v1.Pod, nodes []v1.Node) []*NodeInfo {
	nodeMap := map[string]*NodeInfo{}
	nodeList := []*NodeInfo{}

	for _, node := range nodes {
		var info *NodeInfo = &NodeInfo{}
		if value, ok := nodeMap[node.Name]; ok {
			info = value
		} else {
			nodeMap[node.Name] = info
			info.node = node
			info.pods = []v1.Pod{}
			info.gpuCount = getGPUCountInNode(node)
			info.gpuTotalMemory = getTotalGPUMemory(node)
			info.devs = map[int]*DeviceInfo{}

			for i := 0; i < info.gpuCount; i++ {
				dev := &DeviceInfo{
					pods:        []v1.Pod{},
					idx:         i,
					totalGPUMem: info.gpuTotalMemory / info.gpuCount,
					node:        info.node,
				}
				info.devs[i] = dev
			}
		}

		for _, pod := range pods {
			if pod.Spec.NodeName == node.Name {
				info.pods = append(info.pods, pod)
			}
		}
	}

	for _, v := range nodeMap {
		nodeList = append(nodeList, v)
	}
	return nodeList
}

func (n *NodeInfo) hasPendingGPUMemory() bool {
	_, found := n.devs[-1]
	return found
}

// Get used GPUs in checkpoint
func (n *NodeInfo) buildDeviceInfo() error {
	totalGPUMem := 0
	if n.gpuCount > 0 {
		totalGPUMem = n.gpuTotalMemory / n.gpuCount
	}
GPUSearchLoop:
	for _, pod := range n.pods {
		if gpuMemoryInPod(pod) <= 0 {
			continue GPUSearchLoop
		}
		for devID, usedGPUMem := range n.getDeivceInfo(pod) {
			if n.devs[devID] == nil {
				n.devs[devID] = &DeviceInfo{
					pods:        []v1.Pod{},
					idx:         devID,
					totalGPUMem: totalGPUMem,
					node:        n.node,
				}
			}
			n.devs[devID].usedGPUMem += usedGPUMem
			n.devs[devID].pods = append(n.devs[devID].pods, pod)
		}
	}
	return nil
}

func (n *NodeInfo) getDeivceInfo(pod v1.Pod) map[int]int {
	var err error
	id := -1
	allocation := map[int]int{}
	allocation = GetAllocation(&pod)
	if len(allocation) != 0 {
		return allocation
	}
	if len(pod.ObjectMeta.Annotations) > 0 {
		value, found := pod.ObjectMeta.Annotations[envNVGPUID]
		if found {
			id, err = strconv.Atoi(value)
			if err != nil {
				log.Warningf("Failed to parse dev id %s due to %v for pod %s in ns %s",
					value,
					err,
					pod.Name,
					pod.Namespace)
				id = -1
			}
		} else {
			log.Warningf("Failed to get dev id for pod %s in ns %s",
				pod.Name,
				pod.Namespace)
		}
	}
	allocation[id] = gpuMemoryInPod(pod)
	return allocation
}

func hasPendingGPUMemory(nodeInfos []*NodeInfo) (found bool) {
	for _, info := range nodeInfos {
		if info.hasPendingGPUMemory() {
			return true
		}
	}

	return false
}

func getNodes(nodeName string) ([]v1.Node, error) {
	node, err := clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
	return []v1.Node{*node}, err
}

func isGPUSharingNode(node v1.Node) bool {
	value, ok := node.Status.Allocatable[resourceName]

	if ok {
		ok = (int(value.Value()) > 0)
	}

	return ok
}

var (
	memoryUnit = ""
)

func setUnit(gpuMemory, gpuCount int) {
	if memoryUnit != "" {
		return
	}

	if gpuCount == 0 {
		return
	}

	gpuMemoryByDev := gpuMemory / gpuCount

	if gpuMemoryByDev > 100 {
		memoryUnit = "MiB"
	} else {
		memoryUnit = "GiB"
	}
}

func GetAllocation(pod *v1.Pod) map[int]int {
	podGPUMems := map[int]int{}
	allocationString := ""
	if pod.ObjectMeta.Annotations == nil {
		return podGPUMems
	}
	value, ok := pod.ObjectMeta.Annotations[gpushareAllocationFlag]
	if !ok {
		return podGPUMems
	}
	allocationString = value
	var allocation map[int]map[string]int
	err := json.Unmarshal([]byte(allocationString), &allocation)
	if err != nil {
		return podGPUMems
	}
	for _, containerAllocation := range allocation {
		for id, gpuMem := range containerAllocation {
			gpuIndex, err := strconv.Atoi(id)
			if err != nil {
				log.Errorf("failed to get gpu memory from pod annotation, reason: %v", err)
				return map[int]int{}
			}
			podGPUMems[gpuIndex] += gpuMem
		}
	}
	return podGPUMems
}
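As a minimal sketch of the annotation shape GetAllocation expects — inferred from its unmarshal target map[int]map[string]int, with made-up values — the outer JSON key is the container index, the inner key the GPU index, and the value the GPU memory allocated to that container. Assuming the snippet runs inside the same package:

	// Hypothetical pod annotation in the format parsed by GetAllocation:
	// container 0 holds 4 units of GPU memory on GPU 1 (values are illustrative).
	pod := v1.Pod{}
	pod.ObjectMeta.Annotations = map[string]string{
		gpushareAllocationFlag: `{"0":{"1":4}}`,
	}
	fmt.Println(GetAllocation(&pod)) // map[1:4]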
gpushare-device-plugin/cmd/inspect/podinfo.go (new file, 134 lines)
@@ -0,0 +1,134 @@
package main

import (
	"fmt"
	"os"
	"path"
	"time"

	log "github.com/golang/glog"

	"k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/clientcmd"
)

var (
	clientConfig clientcmd.ClientConfig
	clientset    *kubernetes.Clientset
	restConfig   *rest.Config
	retries      = 5
)

func kubeInit() {
	kubeconfigFile := os.Getenv("KUBECONFIG")
	if kubeconfigFile == "" {
		kubeconfigFile = path.Join(os.Getenv("HOME"), "/.kube/config")
	}
	if _, err := os.Stat(kubeconfigFile); err != nil {
		log.Fatalf("kubeconfig %s failed to find due to %v, please set KUBECONFIG env", kubeconfigFile, err)
	}

	var err error
	restConfig, err = clientcmd.BuildConfigFromFlags("", kubeconfigFile)
	if err != nil {
		log.Fatalf("Failed due to %v", err)
	}
	clientset, err = kubernetes.NewForConfig(restConfig)
	if err != nil {
		log.Fatalf("Failed due to %v", err)
	}
}

type podInfo struct {
	name      string
	namespace string
}

func (p podInfo) equal(p1 podInfo) bool {
	return p.name == p1.name && p.namespace == p1.namespace
}

func getActivePodsByNode(nodeName string) ([]v1.Pod, error) {
	selector := fields.SelectorFromSet(fields.Set{"spec.nodeName": nodeName})
	pods, err := clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
		FieldSelector: selector.String(),
		LabelSelector: labels.Everything().String(),
	})

	for i := 0; i < retries && err != nil; i++ {
		pods, err = clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
			FieldSelector: selector.String(),
			LabelSelector: labels.Everything().String(),
		})
		time.Sleep(100 * time.Millisecond)
	}
	if err != nil {
		return []v1.Pod{}, fmt.Errorf("failed to get Pods in node %v", nodeName)
	}

	return filterActivePods(pods.Items), nil
}

func getActivePodsInAllNodes() ([]v1.Pod, error) {
	pods, err := clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
		LabelSelector: labels.Everything().String(),
	})

	for i := 0; i < retries && err != nil; i++ {
		pods, err = clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
			LabelSelector: labels.Everything().String(),
		})
		time.Sleep(100 * time.Millisecond)
	}
	if err != nil {
		return []v1.Pod{}, fmt.Errorf("failed to get Pods")
	}
	return filterActivePods(pods.Items), nil
}

func filterActivePods(pods []v1.Pod) (activePods []v1.Pod) {
	activePods = []v1.Pod{}
	for _, pod := range pods {
		if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
			continue
		}

		activePods = append(activePods, pod)
	}

	return activePods
}

func getAllSharedGPUNode() ([]v1.Node, error) {
	nodes := []v1.Node{}
	allNodes, err := clientset.CoreV1().Nodes().List(metav1.ListOptions{})
	if err != nil {
		return nodes, err
	}

	for _, item := range allNodes.Items {
		if isGPUSharingNode(item) {
			nodes = append(nodes, item)
		}
	}

	return nodes, nil
}

func gpuMemoryInPod(pod v1.Pod) int {
	var total int
	containers := pod.Spec.Containers
	for _, container := range containers {
		if val, ok := container.Resources.Limits[resourceName]; ok {
			total += int(val.Value())
		}
	}

	return total
}
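Two observations on the listing helpers above: a failed List is retried up to retries (5) times with a 100 ms pause between attempts, and the spec.nodeName field selector restricts the per-node query server-side before filterActivePods drops Succeeded and Failed pods. The List calls take metav1.ListOptions without a context argument, which is the older client-go signature, so this code appears to target a pre-0.18 client-go release.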
gpushare-device-plugin/cmd/nvidia/main.go (new file, 78 lines)
@@ -0,0 +1,78 @@
package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"time"

	"github.com/AliyunContainerService/gpushare-device-plugin/pkg/gpu/nvidia"
	"github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client"
	log "github.com/golang/glog"
	"k8s.io/client-go/rest"
)

var (
	mps              = flag.Bool("mps", false, "Enable or Disable MPS")
	healthCheck      = flag.Bool("health-check", false, "Enable or disable Health check")
	memoryUnit       = flag.String("memory-unit", "GiB", "Set memoryUnit of the GPU Memory, support 'GiB' and 'MiB'")
	queryFromKubelet = flag.Bool("query-kubelet", false, "Query pending pods from kubelet instead of kube-apiserver")
	kubeletAddress   = flag.String("kubelet-address", "0.0.0.0", "Kubelet IP Address")
	kubeletPort      = flag.Uint("kubelet-port", 10250, "Kubelet listened Port")
	clientCert       = flag.String("client-cert", "", "Kubelet TLS client certificate")
	clientKey        = flag.String("client-key", "", "Kubelet TLS client key")
	token            = flag.String("token", "", "Kubelet client bearer token")
	timeout          = flag.Int("timeout", 10, "Kubelet client http timeout duration")
)

func buildKubeletClient() *client.KubeletClient {
	if *clientCert == "" && *clientKey == "" && *token == "" {
		tokenByte, err := ioutil.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/token")
		if err != nil {
			panic(fmt.Errorf("in cluster mode, find token failed, error: %v", err))
		}
		tokenStr := string(tokenByte)
		token = &tokenStr
	}
	kubeletClient, err := client.NewKubeletClient(&client.KubeletClientConfig{
		Address: *kubeletAddress,
		Port:    *kubeletPort,
		TLSClientConfig: rest.TLSClientConfig{
			Insecure:   true,
			ServerName: "gpushare-device-plugin",
			CertFile:   *clientCert,
			KeyFile:    *clientKey,
		},
		BearerToken: *token,
		HTTPTimeout: time.Duration(*timeout) * time.Second,
	})
	if err != nil {
		panic(err)
	}
	return kubeletClient
}

func main() {
	flag.Parse()
	log.V(1).Infoln("Start gpushare device plugin")

	kubeletClient := buildKubeletClient()
	ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck, *queryFromKubelet, translatememoryUnits(*memoryUnit), kubeletClient)
	err := ngm.Run()
	if err != nil {
		log.Fatalf("Failed due to %v", err)
	}
}

func translatememoryUnits(value string) nvidia.MemoryUnit {
	memoryUnit := nvidia.MemoryUnit(value)
	switch memoryUnit {
	case nvidia.MiBPrefix:
	case nvidia.GiBPrefix:
	default:
		log.Warningf("Unsupported memory unit: %s, use memoryUnit Gi as default", value)
		memoryUnit = nvidia.GiBPrefix
	}

	return memoryUnit
}
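Worth noting about buildKubeletClient above: when none of -client-cert, -client-key, and -token are set, the plugin assumes it is running in-cluster, reads the mounted service-account token from /var/run/secrets/kubernetes.io/serviceaccount/token, and presents it as a bearer token to the kubelet's secure port. Because Insecure is true, server certificate verification is skipped, so that token is the only credential in play.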
gpushare-device-plugin/cmd/podgetter/main.go (new file, 57 lines)
@@ -0,0 +1,57 @@
package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"time"

	"github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client"
	"k8s.io/client-go/rest"
)

var (
	clientCert string
	clientKey  string
	token      string
	timeout    int
)

func main() {
	flag.StringVar(&clientCert, "client-cert", "", "")
	flag.StringVar(&clientKey, "client-key", "", "")
	flag.StringVar(&token, "token", "", "")
	flag.IntVar(&timeout, "timeout", 10, "")

	flag.Parse()

	if clientCert == "" && clientKey == "" && token == "" {
		tokenByte, err := ioutil.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/token")
		if err != nil {
			panic(fmt.Errorf("in cluster mode, find token failed, error: %v", err))
		}
		token = string(tokenByte)
	}

	c, err := client.NewKubeletClient(&client.KubeletClientConfig{
		Address: "127.0.0.1",
		Port:    10250,
		TLSClientConfig: rest.TLSClientConfig{
			Insecure:   true,
			ServerName: "kubelet",
			CertFile:   clientCert,
			KeyFile:    clientKey,
		},
		BearerToken: token,
		HTTPTimeout: time.Duration(timeout) * time.Second,
	})
	if err != nil {
		fmt.Println(err)
		return
	}
	podsList, err := c.GetNodeRunningPods()
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(podsList)
}