synchronization

commit 33f9b3ce46
2025-08-25 16:04:00 +08:00
1951 changed files with 854396 additions and 0 deletions

View File

@@ -0,0 +1,61 @@
## Install GPU Sharing with Helm charts in Alibaba Cloud Kubernetes Service
## Requirements:
* Kubernetes >= 1.11, kubectl >= 1.12
* We recommend [Alibaba Cloud Kubernetes Service](https://www.alibabacloud.com/product/kubernetes). This solution works only on dedicated Kubernetes clusters.
## Steps:
1. Run:
```
git clone https://github.com/AliyunContainerService/gpushare-scheduler-extender.git
cd gpushare-scheduler-extender/deployer/chart
helm install --name gpushare --namespace kube-system --set masterCount=3 gpushare-installer
```
2. Add the gpushare label to each node that requires GPU sharing:
```bash
kubectl label node <target_node> gpushare=true
```
For example:
```bash
kubectl label no mynode gpushare=true
```
3. Install the kubectl extension:

3.1 Install kubectl 1.12 or above. You can download and install `kubectl` for Linux:
```bash
curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.12.1/bin/linux/amd64/kubectl
chmod +x ./kubectl
sudo mv ./kubectl /usr/bin/kubectl
```
3.2 Download and install the kubectl extension:
```bash
cd /usr/bin/
wget https://github.com/AliyunContainerService/gpushare-device-plugin/releases/download/v0.3.0/kubectl-inspect-gpushare
chmod u+x /usr/bin/kubectl-inspect-gpushare
```
4. To disable GPU sharing on a node, set its label back to false:
```bash
kubectl label node <target_node> gpushare=false
```
For example:
```bash
kubectl label no mynode gpushare=false
```
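Once the chart is installed and a node is labeled, the kubectl extension from step 3 reports per-GPU memory allocation:

```bash
kubectl inspect gpushare
```

The output lists each gpushare node with allocated/total GPU memory per device (in GiB, matching the device plugin's `--memory-unit=GiB` flag); the exact columns vary by version.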

View File

@@ -0,0 +1,21 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*~
# Various IDEs
.project
.idea/
*.tmproj

View File

@@ -0,0 +1,5 @@
apiVersion: v1
appVersion: "1.0"
description: A Helm chart for Kubernetes
name: gpushare-installer
version: 0.7.0

View File

@@ -0,0 +1,27 @@
### 0.1.0
* Support gpushare deployment
### 0.2.0
* Fix failure to restore exclusive GPU scheduling after gpushare is removed
### 0.3.0
* Support Helm v3
### 0.4.0
* Remove the kubeVersion env
### 0.5.0
* Change the host mount directory to /etc/kubernetes
### 0.6.0
* Change the installer StatefulSet to a Job
### 0.7.0
* Support the unhealthy ConfigMap

View File

@@ -0,0 +1,32 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
*/}}
{{- define "gpushare-installer.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "gpushare-installer.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "gpushare-installer.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end -}}
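As a quick illustration of how these helpers resolve, here is a hypothetical local render, assuming Helm v3 and a release named `gpushare` as in the README:

```bash
# "gpushare-installer.fullname" evaluates to "gpushare-gpushare-installer"
# (release name + chart name, truncated to 63 chars). If the release name
# already contained "gpushare-installer", the release name alone would be used.
helm template gpushare ./gpushare-installer | head -n 20
```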

View File

@@ -0,0 +1,43 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: device-plugin-evict-ds
  namespace: kube-system
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      component: gpushare-device-plugin
      app: gpushare
      name: device-plugin-evict-ds
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        component: gpushare-device-plugin
        app: gpushare
        name: device-plugin-evict-ds
    spec:
      hostNetwork: true
      nodeSelector:
        gpushare: "true"
      containers:
      - image: "{{ .Values.images.evictor.image }}:{{ .Values.images.evictor.tag }}"
        imagePullPolicy: {{ .Values.images.evictor.pullPolicy }}
        command:
        - bash
        - /dp-evict/dp-evict-on-host.sh
        name: gpushare
        # Make this a Guaranteed pod so it is never evicted due to node resource pressure.
        securityContext:
          privileged: true
        volumeMounts:
        - name: kube-dir
          mountPath: /etc/kubernetes
      volumes:
      - hostPath:
          path: /etc/kubernetes
          type: Directory
        name: kube-dir
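Because of the `gpushare: "true"` nodeSelector, the evictor runs only on nodes labeled in step 2 of the README, where it moves the exclusive NVIDIA device plugin's static-pod manifest aside (see dp-evict-on-host.sh below). A quick sanity check that it landed on the intended nodes:

```bash
# The node set in the first listing should match the second.
kubectl get pods -n kube-system -l name=device-plugin-evict-ds -o wide
kubectl get nodes -l gpushare=true
```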

View File

@@ -0,0 +1,52 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: device-plugin-recover-ds
  namespace: kube-system
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      component: gpushare-device-plugin
      app: gpushare
      name: device-plugin-recover-ds
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        component: gpushare-device-plugin
        app: gpushare
        name: device-plugin-recover-ds
    spec:
      nodeSelector:
        gpushare: "false"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: aliyun.accelerator/nvidia_count
                operator: Exists
      # nodeSelector:
      #   gpu-instance: "true"
      hostNetwork: true
      containers:
      - image: "{{ .Values.images.recover.image }}:{{ .Values.images.recover.tag }}"
        imagePullPolicy: {{ .Values.images.recover.pullPolicy }}
        command:
        - bash
        - /dp-evict/dp-recover-on-host.sh
        name: gpushare
        # Make this a Guaranteed pod so it is never evicted due to node resource pressure.
        securityContext:
          privileged: true
        volumeMounts:
        - name: kube-dir
          mountPath: /etc/kubernetes
      volumes:
      - hostPath:
          path: /etc/kubernetes
          type: Directory
        name: kube-dir

View File

@@ -0,0 +1,61 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gpushare-device-plugin-ds
  namespace: kube-system
spec:
  selector:
    matchLabels:
      component: gpushare-device-plugin
      app: gpushare
      name: gpushare-device-plugin-ds
      type: runtime
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        component: gpushare-device-plugin
        app: gpushare
        name: gpushare-device-plugin-ds
        type: runtime
    spec:
      serviceAccount: gpushare-device-plugin
      hostNetwork: true
      nodeSelector:
        gpushare: "true"
      containers:
      - image: "{{ .Values.images.devicePlugin.image }}:{{ .Values.images.devicePlugin.tag }}"
        imagePullPolicy: {{ .Values.images.devicePlugin.pullPolicy }}
        name: gpushare
        # Make this a Guaranteed pod (requests == limits) so it is never
        # evicted due to node resource pressure.
        command:
        - gpushare-device-plugin-v2
        - -logtostderr
        - --v=5
        - --memory-unit=GiB
        resources:
          limits:
            memory: "300Mi"
            cpu: "1"
          requests:
            memory: "300Mi"
            cpu: "1"
        env:
        - name: KUBECONFIG
          value: /etc/kubernetes/kubelet.conf
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        volumeMounts:
        - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
      volumes:
      - name: device-plugin
        hostPath:
          path: /var/lib/kubelet/device-plugins
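Once the plugin registers with the kubelet, each sharing node advertises the `aliyun.com/gpu-mem` extended resource (the name managed by the scheduler policy at the end of this chart), counted in GiB because of the `--memory-unit=GiB` flag. A quick check, assuming a node named `mynode`:

```bash
# Allocatable should now include aliyun.com/gpu-mem, in GiB.
kubectl describe node mynode | grep aliyun.com/gpu-mem
```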

View File

@@ -0,0 +1,59 @@
# rbac.yaml
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-device-plugin
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - events
  verbs:
  - create
  - patch
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - update
  - patch
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - nodes/status
  verbs:
  - patch
  - update
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpushare-device-plugin
  namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-device-plugin
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: gpushare-device-plugin
subjects:
- kind: ServiceAccount
  name: gpushare-device-plugin
  namespace: kube-system
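With the binding in place, kubectl's impersonation support gives a quick check that the service account received the permissions above:

```bash
# Should print "yes": the ClusterRole grants list on nodes.
kubectl auth can-i list nodes \
  --as=system:serviceaccount:kube-system:gpushare-device-plugin
```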

View File

@@ -0,0 +1,45 @@
# deployment yaml
---
kind: Deployment
apiVersion: apps/v1
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: gpushare
      component: gpushare-schd-extender
      type: runtime
  replicas: 1
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: gpushare
        component: gpushare-schd-extender
        type: runtime
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ''
    spec:
      hostNetwork: true
      tolerations:
      - effect: NoSchedule
        operator: Exists
        key: node-role.kubernetes.io/master
      - effect: NoSchedule
        operator: Exists
        key: node.cloudprovider.kubernetes.io/uninitialized
      nodeSelector:
        node-role.kubernetes.io/master: ""
      serviceAccount: gpushare-schd-extender
      containers:
      - name: gpushare-schd-extender
        image: "{{ .Values.images.extender.image }}:{{ .Values.images.extender.tag }}"
        imagePullPolicy: {{ .Values.images.extender.pullPolicy }}
        env:
        - name: LOG_LEVEL
          value: debug
        - name: PORT
          value: "12345"

View File

@@ -0,0 +1,67 @@
# rbac.yaml
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-schd-extender
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - events
  verbs:
  - create
  - patch
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - update
  - patch
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - bindings
  - pods/binding
  verbs:
  - create
- apiGroups:
  - ""
  resources:
  - configmaps
  verbs:
  - get
  - list
  - watch
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: gpushare-schd-extender
subjects:
- kind: ServiceAccount
  name: gpushare-schd-extender
  namespace: kube-system

View File

@@ -0,0 +1,19 @@
apiVersion: v1
kind: Service
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
  labels:
    app: gpushare
    component: gpushare-schd-extender
spec:
  # type: ClusterIP
  type: NodePort
  ports:
  - port: 12345
    name: http
    targetPort: 12345
    nodePort: 32766
  selector:
    app: gpushare
    component: gpushare-schd-extender
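The fixed NodePort 32766 is what the scheduler policy's `urlPrefix` (the JSON file at the end of this chart) points at; the installer script later rewrites its 127.0.0.1 placeholder to the master's IP. A rough reachability check from a master node:

```bash
# Any HTTP response here proves the extender is reachable on the
# NodePort referenced by the scheduler policy.
curl -s http://127.0.0.1:32766/version
```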

View File

@@ -0,0 +1,66 @@
apiVersion: batch/v1
kind: Job
metadata:
  name: gpushare-installer
  namespace: kube-system
  labels:
    app: gpushare
    name: gpushare-installer
    chart: {{ template "gpushare-installer.chart" . }}
    release: {{ .Release.Name }}
    heritage: {{ .Release.Service }}
spec:
  parallelism: {{ .Values.masterCount }}
  template:
    metadata:
      labels:
        chart: {{ template "gpushare-installer.chart" . }}
        release: {{ .Release.Name }}
        heritage: {{ .Release.Service }}
        app: gpushare
        name: gpushare-installer
    spec:
      hostNetwork: true
      tolerations:
      - effect: NoSchedule
        operator: Exists
        key: node-role.kubernetes.io/master
      - effect: NoSchedule
        operator: Exists
        key: node.cloudprovider.kubernetes.io/uninitialized
      nodeSelector:
        node-role.kubernetes.io/master: ""
      restartPolicy: OnFailure
      containers:
      - name: deploy-schd
        image: "{{ .Values.images.installer.image }}:{{ .Values.images.installer.tag }}"
        imagePullPolicy: {{ .Values.images.installer.pullPolicy }}
        securityContext:
          privileged: true
        command:
        - bash
        - /schd-extender/install-sched-extender-on-host.sh
        env:
        - name: NODE_IP
          valueFrom:
            fieldRef:
              fieldPath: status.hostIP
        volumeMounts:
        - name: kube-dir
          mountPath: /etc/kubernetes
      volumes:
      - hostPath:
          path: /etc/kubernetes
          type: Directory
        name: kube-dir
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
              - key: name
                operator: In
                values:
                - gpushare-installer
            topologyKey: "kubernetes.io/hostname"
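Since `parallelism` equals `masterCount` and the pod anti-affinity forbids two installer pods on the same host, the expected end state is exactly one Completed pod per master:

```bash
kubectl get pods -n kube-system -l name=gpushare-installer -o wide
```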

View File

@@ -0,0 +1,29 @@
# Default values for gpushare-installer.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
masterCount: 3
images:
  extender:
    image: "registry.cn-beijing.aliyuncs.com/acs/k8s-gpushare-schd-extender"
    tag: v1.0.0-ce6f800-aliyun
    pullPolicy: IfNotPresent
  installer:
    image: "registry.cn-beijing.aliyuncs.com/acs/schd-extender-deployer"
    tag: v1.0.0-b56d26d-aliyun
    pullPolicy: IfNotPresent
  devicePlugin:
    image: "registry.cn-beijing.aliyuncs.com/acs/k8s-gpushare-plugin"
    tag: v1.0.0-2656995-aliyun
    pullPolicy: IfNotPresent
  evictor:
    image: "registry.cn-beijing.aliyuncs.com/acs/nvidia-device-plugin-evict"
    tag: v1.0.0-b56d26d-aliyun
    pullPolicy: IfNotPresent
  recover:
    image: "registry.cn-beijing.aliyuncs.com/acs/nvidia-device-plugin-recover"
    tag: v1.0.0-b56d26d-aliyun
    pullPolicy: IfNotPresent
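Any of these values can be overridden at install time. For example, on a single-master cluster (same Helm v2 invocation as the README):

```bash
helm install --name gpushare --namespace kube-system \
  --set masterCount=1 \
  --set images.extender.pullPolicy=Always \
  gpushare-installer
```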

View File

@@ -0,0 +1,12 @@
FROM debian:bullseye-slim
RUN echo "deb [arch=amd64] http://mirrors.aliyun.com/debian/ bullseye main non-free contrib" \
    > /etc/apt/sources.list
RUN apt-get update && \
    apt-get install -y curl tzdata iproute2 bash && \
    rm -rf /var/cache/apt/* && \
    cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
    echo "Asia/Shanghai" > /etc/timezone && \
    mkdir -p /dp-evict
ADD dp-evict /dp-evict
RUN chmod -R +x /dp-evict

View File

@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -xe
BASEDIR=$(dirname "$0")
echo "build docker images in $BASEDIR"
TIMESTAMP=$(date +%Y%m%d%H%M)
cd "$BASEDIR"
# docker build --no-cache -t $IMAGE -f $FILE $BASEDIR
docker build --network=host -t registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-evict:$TIMESTAMP .
docker push registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-evict:$TIMESTAMP
echo registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-evict:$TIMESTAMP

View File

@@ -0,0 +1,31 @@
#!/usr/bin/env bash
set -e -x

backup_dir="/etc/kubernetes/manifests-backup"

public::common::log() {
    echo $(date +"[%Y%m%d %H:%M:%S]: ") $1
}

public::evict::gpu-device-plugin() {
    dir=/etc/kubernetes/manifests/
    if [ -f /etc/kubernetes/manifests/nvidia-device-plugin.yml ]; then
        backup_dir="/etc/kubernetes/manifests-backup/"
        mkdir -p $backup_dir
        # Moving the static-pod manifest out of /etc/kubernetes/manifests
        # makes the kubelet stop the exclusive NVIDIA device plugin.
        mv /etc/kubernetes/manifests/nvidia-device-plugin.yml $backup_dir
    else
        public::common::log "Skip removing nvidia-device-plugin.yml, because it doesn't exist."
    fi
}

main() {
    public::evict::gpu-device-plugin
    touch /ready
    while sleep 3600; do :; done
}

main "$@"

View File

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
set -xe
if [ -d "/k8s-host" ]; then
    rm -rf /k8s-host/usr/local/dp-evict
    mkdir -p /k8s-host/usr/local/dp-evict
    cp -r /dp-evict/* /k8s-host/usr/local/dp-evict
    chmod -R +x /k8s-host/usr/local/dp-evict
    chroot /k8s-host /usr/local/dp-evict/dp-evict-on-host.sh "$@"
    while sleep 3600; do :; done
fi

View File

@@ -0,0 +1,12 @@
FROM debian:bullseye-slim
RUN echo "deb [arch=amd64] http://mirrors.aliyun.com/debian/ bullseye main non-free contrib" \
    > /etc/apt/sources.list
RUN apt-get update && \
    apt-get install -y curl tzdata iproute2 bash && \
    rm -rf /var/cache/apt/* && \
    cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
    echo "Asia/Shanghai" > /etc/timezone && \
    mkdir -p /dp-evict
ADD dp-evict /dp-evict
RUN chmod -R +x /dp-evict

View File

@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -xe
BASEDIR=$(dirname "$0")
echo "build docker images in $BASEDIR"
TIMESTAMP=$(date +%Y%m%d%H%M)
cd "$BASEDIR"
# docker build --no-cache -t $IMAGE -f $FILE $BASEDIR
docker build --network=host -t registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-recover:$TIMESTAMP .
docker push registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-recover:$TIMESTAMP
echo registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-recover:$TIMESTAMP

View File

@@ -0,0 +1,34 @@
#!/usr/bin/env bash
set -e -x

dir="/etc/kubernetes/manifests"
backup_dir="/etc/kubernetes/manifests-backup"

public::common::log() {
    echo $(date +"[%Y%m%d %H:%M:%S]: ") $1
}

public::recover::gpu-device-plugin() {
    if [ -f $dir/nvidia-device-plugin.yml ]; then
        public::common::log "Skip recovering nvidia-device-plugin.yml, because it already exists."
    else
        if [ -f $backup_dir/nvidia-device-plugin.yml ]; then
            # Restoring the static-pod manifest makes the kubelet start
            # the exclusive NVIDIA device plugin again.
            cp -f $backup_dir/nvidia-device-plugin.yml $dir/nvidia-device-plugin.yml
            public::common::log "Finish recovering nvidia-device-plugin.yml."
        else
            public::common::log "No nvidia-device-plugin.yml to recover."
        fi
    fi
}

main() {
    public::recover::gpu-device-plugin
    touch /ready
    while sleep 3600; do :; done
}

main "$@"

View File

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
set -xe
if [ -d "/k8s-host" ]; then
    rm -rf /k8s-host/usr/local/dp-evict
    mkdir -p /k8s-host/usr/local/dp-evict
    cp -r /dp-evict/* /k8s-host/usr/local/dp-evict
    chmod -R +x /k8s-host/usr/local/dp-evict
    chroot /k8s-host /usr/local/dp-evict/dp-recover-on-host.sh "$@"
    while sleep 3600; do :; done
fi

View File

@@ -0,0 +1,14 @@
FROM debian:bullseye-slim
RUN echo "deb [arch=amd64] http://mirrors.aliyun.com/debian/ bullseye main non-free contrib" \
    > /etc/apt/sources.list
RUN apt-get update && \
    apt-get install -y curl tzdata iproute2 bash && \
    rm -rf /var/cache/apt/* && \
    cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
    echo "Asia/Shanghai" > /etc/timezone && \
    mkdir -p /schd-extender
ADD schd-extender /schd-extender
RUN chmod -R +x /schd-extender

View File

@@ -0,0 +1,17 @@
#!/usr/bin/env bash
set -xe
BASEDIR=$(dirname "$0")
echo "build docker images in $BASEDIR"
TIMESTAMP=$(date +%Y%m%d%H%M)
cd "$BASEDIR"
# docker build --no-cache -t $IMAGE -f $FILE $BASEDIR
docker build -t registry.cn-hangzhou.aliyuncs.com/acs/schd-extender-deployer:$TIMESTAMP .
docker tag registry.cn-hangzhou.aliyuncs.com/acs/schd-extender-deployer:$TIMESTAMP cheyang/schd-extender-deployer:$TIMESTAMP
docker push registry.cn-hangzhou.aliyuncs.com/acs/schd-extender-deployer:$TIMESTAMP

View File

@@ -0,0 +1,75 @@
#!/usr/bin/env bash
set -e -x

dir=/etc/kubernetes/manifests
backup_dir="/etc/kubernetes/manifests-backup"
TIMESTAMP=$(date +%Y%m%d%H%M%S)

public::common::log() {
    echo $(date +"[%Y%m%d %H:%M:%S]: ") $1
}

public::deployer::sche-policy-config() {
    # Back up the original kube-scheduler manifest once; later runs get
    # timestamped backups.
    mkdir -p $backup_dir
    if [ ! -f $backup_dir/kube-scheduler.ori.yaml ]; then
        cp /etc/kubernetes/manifests/kube-scheduler.yaml $backup_dir/kube-scheduler.ori.yaml
        public::common::log "Backup $backup_dir/kube-scheduler.ori.yaml"
    else
        cp /etc/kubernetes/manifests/kube-scheduler.yaml $backup_dir/kube-scheduler-$TIMESTAMP.yaml
        public::common::log "Backup $backup_dir/kube-scheduler-$TIMESTAMP.yaml"
    fi

    if [ ! -f $backup_dir/scheduler-policy-config.ori.json ]; then
        if [ -f /etc/kubernetes/scheduler-policy-config.json ]; then
            cp /etc/kubernetes/scheduler-policy-config.json $backup_dir/scheduler-policy-config.ori.json
            public::common::log "Backup $backup_dir/scheduler-policy-config.ori.json"
        fi
    else
        if [ -f /etc/kubernetes/scheduler-policy-config.json ]; then
            cp /etc/kubernetes/scheduler-policy-config.json $backup_dir/scheduler-policy-config-$TIMESTAMP.json
            public::common::log "Backup $backup_dir/scheduler-policy-config-$TIMESTAMP.json"
        fi
    fi

    public::common::log "Configure scheduler extender"
    # Install the policy file and point its urlPrefix at this master's IP.
    cp -f /schd-extender/scheduler-policy-config.json /etc/kubernetes/scheduler-policy-config.json
    sed -i 's/127.0.0.1/'"${NODE_IP}"'/g' /etc/kubernetes/scheduler-policy-config.json

    # Bump a revision annotation so the kubelet restarts the static pod.
    if ! grep 'deployment.kubernetes.io/revision' $dir/kube-scheduler.yaml; then
        sed -i '/scheduler.alpha.kubernetes.io\/critical-pod/a \ \ \ \ deployment.kubernetes.io/revision: "'"${TIMESTAMP}"'"' $dir/kube-scheduler.yaml
    else
        # sed -i '/deployment.kubernetes.io\/revision/d' $dir/kube-scheduler.yaml
        sed -i 's#deployment.kubernetes.io/revision:.*#deployment.kubernetes.io/revision: "'"${TIMESTAMP}"'"#' $dir/kube-scheduler.yaml
    fi

    if ! grep 'policy-config-file=/etc/kubernetes/scheduler-policy-config.json' $dir/kube-scheduler.yaml; then
        sed -i "/- kube-scheduler/a\ \ \ \ - --policy-config-file=/etc/kubernetes/scheduler-policy-config.json" $dir/kube-scheduler.yaml
    else
        public::common::log "Skip the kube-scheduler config, because the extender is already configured."
    fi

    # add scheduler config policy volumeMounts
    if ! grep 'mountPath: /etc/kubernetes/scheduler-policy-config.json' $dir/kube-scheduler.yaml; then
        sed -i "/ volumeMounts:/a\ \ \ \ - mountPath: /etc/kubernetes/scheduler-policy-config.json\n\ \ \ \ \ \ name: scheduler-policy-config\n\ \ \ \ \ \ readOnly: true" $dir/kube-scheduler.yaml
    else
        public::common::log "Skip the scheduler-policy-config mountPath, because it is already configured."
    fi

    # add scheduler config policy volumes
    if ! grep 'path: /etc/kubernetes/scheduler-policy-config.json' $dir/kube-scheduler.yaml; then
        sed -i "/ volumes:/a\ \ - hostPath:\n\ \ \ \ \ \ path: /etc/kubernetes/scheduler-policy-config.json\n\ \ \ \ \ \ type: FileOrCreate\n\ \ \ \ name: scheduler-policy-config" $dir/kube-scheduler.yaml
    else
        public::common::log "Skip the scheduler-policy-config volumes, because it is already configured."
    fi
}

main() {
    public::deployer::sche-policy-config
    touch /ready
    #while sleep 3600; do :; done
}

main
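For orientation, the sed edits above aim to leave the static kube-scheduler manifest looking roughly like this (an illustrative fragment; the surrounding file varies by cluster version):

```yaml
# /etc/kubernetes/manifests/kube-scheduler.yaml after the script runs:
# the policy flag, mount, and volume have been appended.
spec:
  containers:
  - command:
    - kube-scheduler
    - --policy-config-file=/etc/kubernetes/scheduler-policy-config.json
    volumeMounts:
    - mountPath: /etc/kubernetes/scheduler-policy-config.json
      name: scheduler-policy-config
      readOnly: true
  volumes:
  - hostPath:
      path: /etc/kubernetes/scheduler-policy-config.json
      type: FileOrCreate
    name: scheduler-policy-config
```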

View File

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
set -xe
if [ -d "/k8s-host" ]; then
    rm -rf /k8s-host/usr/local/k8s-schd-extender
    mkdir -p /k8s-host/usr/local/k8s-schd-extender
    cp -r /schd-extender/* /k8s-host/usr/local/k8s-schd-extender
    chmod -R +x /k8s-host/usr/local/k8s-schd-extender/
    chroot /k8s-host /usr/local/k8s-schd-extender/install-sched-extender-on-host.sh
    while sleep 3600; do :; done
fi

View File

@@ -0,0 +1,20 @@
{
  "kind": "Policy",
  "apiVersion": "v1",
  "extenders": [
    {
      "urlPrefix": "http://127.0.0.1:32766/gpushare-scheduler",
      "filterVerb": "filter",
      "bindVerb": "bind",
      "enableHttps": false,
      "nodeCacheCapable": true,
      "managedResources": [
        {
          "name": "aliyun.com/gpu-mem",
          "ignoredByScheduler": false
        }
      ],
      "ignorable": false
    }
  ]
}
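With the extender registered for `aliyun.com/gpu-mem`, workloads share a GPU by requesting memory instead of whole devices. A minimal sketch of such a pod (the name, image, and request size are placeholders):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: gpu-share-demo
spec:
  containers:
  - name: demo
    image: your-registry/your-gpu-app:latest   # placeholder image
    resources:
      limits:
        # Request 3 GiB of GPU memory rather than a whole GPU; the
        # extender filters nodes and binds via its filter/bind verbs.
        aliyun.com/gpu-mem: 3
```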