synchronization
gpushare-scheduler-extender/deployer/README.md
@@ -0,0 +1,61 @@
## Install GPU Sharing with Helm charts in Alibaba Cloud Kubernetes Service

## Requirements:

* Kubernetes >= 1.11, kubectl >= 1.12

* [Alibaba Cloud Kubernetes Service](https://www.alibabacloud.com/product/kubernetes) is recommended. This solution only supports dedicated Kubernetes clusters.

## Steps:

1. Just run:

```bash
git clone https://github.com/AliyunContainerService/gpushare-scheduler-extender.git
cd gpushare-scheduler-extender/deployer/chart
helm install --name gpushare --namespace kube-system --set masterCount=3 gpushare-installer
```

2. Add the gpushare label to each node that should share its GPUs:

```bash
kubectl label node <target_node> gpushare=true
```

For example:

```bash
kubectl label node mynode gpushare=true
```

3. Install the kubectl extension (steps 4 and 5):

4. Install kubectl 1.12 or above. You can download and install `kubectl` for Linux:

```bash
curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.12.1/bin/linux/amd64/kubectl
chmod +x ./kubectl
sudo mv ./kubectl /usr/bin/kubectl
```

5. Download and install the kubectl extension:

```bash
cd /usr/bin/
wget https://github.com/AliyunContainerService/gpushare-device-plugin/releases/download/v0.3.0/kubectl-inspect-gpushare
chmod u+x /usr/bin/kubectl-inspect-gpushare
```

6. To disable GPU sharing on a node, set its label to false (`--overwrite` is required when changing an existing label value):

```bash
kubectl label node <target_node> gpushare=false --overwrite
```

For example:

```bash
kubectl label node mynode gpushare=false --overwrite
```
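Once the extension is installed, you can inspect shared-GPU allocation across the cluster. A quick sketch of typical usage (the exact output depends on your nodes and running pods):

```bash
# Summary of allocated vs. total shared GPU memory per node
kubectl inspect gpushare

# Per-pod detail (-d) for each GPU on each node
kubectl inspect gpushare -d
```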
@@ -0,0 +1,21 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*~
# Various IDEs
.project
.idea/
*.tmproj
@@ -0,0 +1,5 @@
apiVersion: v1
appVersion: "1.0"
description: A Helm chart for Kubernetes
name: gpushare-installer
version: 0.7.0
@@ -0,0 +1,27 @@
### 0.1.0

* support gpushare deployment

### 0.2.0

* fix failure to restore exclusive GPU scheduling after removing gpushare

### 0.3.0

* support helm v3

### 0.4.0

* delete env kubeVersion

### 0.5.0

* change the host mount dir to /etc/kubernetes

### 0.6.0

* change statefulset to job

### 0.7.0

* support unhealthy configmap
@@ -0,0 +1,32 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
*/}}
{{- define "gpushare-installer.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "gpushare-installer.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}

{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "gpushare-installer.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end -}}
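The job.yaml template below consumes the `gpushare-installer.chart` helper for its labels. For reference, this is how the other helpers would be referenced from a manifest; a sketch only, since no template in this chart currently uses `fullname`:

```yaml
metadata:
  # For release "gpushare", fullname renders "gpushare-gpushare-installer"
  name: {{ include "gpushare-installer.fullname" . }}
  labels:
    # Renders "gpushare-installer-0.7.0" for this chart version
    chart: {{ template "gpushare-installer.chart" . }}
```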
@@ -0,0 +1,43 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: device-plugin-evict-ds
  namespace: kube-system
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      component: gpushare-device-plugin
      app: gpushare
      name: device-plugin-evict-ds
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        component: gpushare-device-plugin
        app: gpushare
        name: device-plugin-evict-ds
    spec:
      hostNetwork: true
      # Runs only on nodes opted in to GPU sharing.
      nodeSelector:
        gpushare: "true"
      containers:
      - image: "{{ .Values.images.evictor.image }}:{{ .Values.images.evictor.tag }}"
        imagePullPolicy: {{ .Values.images.evictor.pullPolicy }}
        command:
        - bash
        - /dp-evict/dp-evict-on-host.sh
        name: gpushare
        # Privileged so it can move the static-pod manifest under the host's /etc/kubernetes.
        securityContext:
          privileged: true
        volumeMounts:
        - name: kube-dir
          mountPath: /etc/kubernetes
      volumes:
      - hostPath:
          path: /etc/kubernetes
          type: Directory
        name: kube-dir
@@ -0,0 +1,52 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: device-plugin-recover-ds
  namespace: kube-system
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      component: gpushare-device-plugin
      app: gpushare
      name: device-plugin-recover-ds
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        component: gpushare-device-plugin
        app: gpushare
        name: device-plugin-recover-ds
    spec:
      # Runs on GPU nodes that have opted out of sharing again.
      nodeSelector:
        gpushare: "false"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: aliyun.accelerator/nvidia_count
                operator: Exists
      # nodeSelector:
      #   gpu-instance: "true"
      hostNetwork: true
      containers:
      - image: "{{ .Values.images.recover.image }}:{{ .Values.images.recover.tag }}"
        imagePullPolicy: {{ .Values.images.recover.pullPolicy }}
        command:
        - bash
        - /dp-evict/dp-recover-on-host.sh
        name: gpushare
        # Privileged so it can restore the static-pod manifest under the host's /etc/kubernetes.
        securityContext:
          privileged: true
        volumeMounts:
        - name: kube-dir
          mountPath: /etc/kubernetes
      volumes:
      - hostPath:
          path: /etc/kubernetes
          type: Directory
        name: kube-dir
@@ -0,0 +1,61 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gpushare-device-plugin-ds
  namespace: kube-system
spec:
  selector:
    matchLabels:
      component: gpushare-device-plugin
      app: gpushare
      name: gpushare-device-plugin-ds
      type: runtime
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        component: gpushare-device-plugin
        app: gpushare
        name: gpushare-device-plugin-ds
        type: runtime
    spec:
      serviceAccount: gpushare-device-plugin
      hostNetwork: true
      nodeSelector:
        gpushare: "true"
      containers:
      - image: "{{ .Values.images.devicePlugin.image }}:{{ .Values.images.devicePlugin.tag }}"
        imagePullPolicy: {{ .Values.images.devicePlugin.pullPolicy }}
        name: gpushare
        command:
        - gpushare-device-plugin-v2
        - -logtostderr
        - --v=5
        - --memory-unit=GiB
        # Equal requests and limits make this a Guaranteed pod, which will never be
        # evicted because of the node's resource consumption.
        resources:
          limits:
            memory: "300Mi"
            cpu: "1"
          requests:
            memory: "300Mi"
            cpu: "1"
        env:
        - name: KUBECONFIG
          value: /etc/kubernetes/kubelet.conf
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        volumeMounts:
        - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
      volumes:
      - name: device-plugin
        hostPath:
          path: /var/lib/kubelet/device-plugins
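With the device plugin advertising shared GPU memory, workloads request it through the `aliyun.com/gpu-mem` extended resource (the same resource name the scheduler policy below manages). A minimal sketch of a consuming pod; the pod name and image are illustrative only:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: gpu-mem-demo          # hypothetical example pod
spec:
  containers:
  - name: demo
    image: cuda-app:latest    # placeholder image for a GPU workload
    resources:
      limits:
        # Units are GiB because the plugin runs with --memory-unit=GiB
        aliyun.com/gpu-mem: 2
```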
@@ -0,0 +1,59 @@
# rbac.yaml
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-device-plugin
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - events
  verbs:
  - create
  - patch
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - update
  - patch
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - nodes/status
  verbs:
  - patch
  - update
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpushare-device-plugin
  namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-device-plugin
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: gpushare-device-plugin
subjects:
- kind: ServiceAccount
  name: gpushare-device-plugin
  namespace: kube-system
@@ -0,0 +1,45 @@
# deployment.yaml
---
kind: Deployment
apiVersion: apps/v1
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: gpushare
      component: gpushare-schd-extender
      type: runtime
  replicas: 1
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: gpushare
        component: gpushare-schd-extender
        type: runtime
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ''
    spec:
      hostNetwork: true
      tolerations:
      - effect: NoSchedule
        operator: Exists
        key: node-role.kubernetes.io/master
      - effect: NoSchedule
        operator: Exists
        key: node.cloudprovider.kubernetes.io/uninitialized
      nodeSelector:
        node-role.kubernetes.io/master: ""
      serviceAccount: gpushare-schd-extender
      containers:
      - name: gpushare-schd-extender
        image: "{{ .Values.images.extender.image }}:{{ .Values.images.extender.tag }}"
        imagePullPolicy: {{ .Values.images.extender.pullPolicy }}
        env:
        - name: LOG_LEVEL
          value: debug
        - name: PORT
          value: "12345"
@@ -0,0 +1,67 @@
# rbac.yaml
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-schd-extender
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - events
  verbs:
  - create
  - patch
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - update
  - patch
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - bindings
  - pods/binding
  verbs:
  - create
- apiGroups:
  - ""
  resources:
  - configmaps
  verbs:
  - get
  - list
  - watch
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: gpushare-schd-extender
subjects:
- kind: ServiceAccount
  name: gpushare-schd-extender
  namespace: kube-system
|
@@ -0,0 +1,19 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: gpushare-schd-extender
|
||||
namespace: kube-system
|
||||
labels:
|
||||
app: gpushare
|
||||
component: gpushare-schd-extender
|
||||
spec:
|
||||
# type: ClusterIP
|
||||
type: NodePort
|
||||
ports:
|
||||
- port: 12345
|
||||
name: http
|
||||
targetPort: 12345
|
||||
nodePort: 32766
|
||||
selector:
|
||||
app: gpushare
|
||||
component: gpushare-schd-extender
|
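The fixed nodePort (32766) is what ties this Service to the `urlPrefix` in the scheduler-policy-config.json further down. A quick reachability check from a master node; this assumes the extender exposes a `/version` endpoint, so adjust if your build differs:

```bash
# Should return the extender's version string if the service is reachable
curl 127.0.0.1:32766/version
```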
@@ -0,0 +1,66 @@
apiVersion: batch/v1
kind: Job
metadata:
  name: gpushare-installer
  namespace: kube-system
  labels:
    app: gpushare
    name: gpushare-installer
    chart: {{ template "gpushare-installer.chart" . }}
    release: {{ .Release.Name }}
    heritage: {{ .Release.Service }}
spec:
  # One installer pod per master, so every kube-scheduler instance gets configured.
  parallelism: {{ .Values.masterCount }}
  template:
    metadata:
      labels:
        chart: {{ template "gpushare-installer.chart" . }}
        release: {{ .Release.Name }}
        heritage: {{ .Release.Service }}
        app: gpushare
        name: gpushare-installer
    spec:
      hostNetwork: true
      tolerations:
      - effect: NoSchedule
        operator: Exists
        key: node-role.kubernetes.io/master
      - effect: NoSchedule
        operator: Exists
        key: node.cloudprovider.kubernetes.io/uninitialized
      nodeSelector:
        node-role.kubernetes.io/master: ""
      restartPolicy: OnFailure
      containers:
      - name: deploy-schd
        image: "{{ .Values.images.installer.image }}:{{ .Values.images.installer.tag }}"
        imagePullPolicy: {{ .Values.images.installer.pullPolicy }}
        securityContext:
          privileged: true
        command:
        - bash
        - /schd-extender/install-sched-extender-on-host.sh
        env:
        - name: NODE_IP
          valueFrom:
            fieldRef:
              fieldPath: status.hostIP
        volumeMounts:
        - name: kube-dir
          mountPath: /etc/kubernetes
      volumes:
      - hostPath:
          path: /etc/kubernetes
          type: Directory
        name: kube-dir
      # Anti-affinity spreads the parallel pods across masters instead of stacking them.
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
              - key: name
                operator: In
                values:
                - gpushare-installer
            topologyKey: "kubernetes.io/hostname"
@@ -0,0 +1,29 @@
# Default values for gpushare-installer.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

masterCount: 3

images:
  extender:
    image: "registry.cn-beijing.aliyuncs.com/acs/k8s-gpushare-schd-extender"
    tag: v1.0.0-ce6f800-aliyun
    pullPolicy: IfNotPresent
  installer:
    image: "registry.cn-beijing.aliyuncs.com/acs/schd-extender-deployer"
    tag: v1.0.0-b56d26d-aliyun
    pullPolicy: IfNotPresent
  devicePlugin:
    image: "registry.cn-beijing.aliyuncs.com/acs/k8s-gpushare-plugin"
    tag: v1.0.0-2656995-aliyun
    pullPolicy: IfNotPresent
  evictor:
    image: "registry.cn-beijing.aliyuncs.com/acs/nvidia-device-plugin-evict"
    tag: v1.0.0-b56d26d-aliyun
    pullPolicy: IfNotPresent
  recover:
    image: "registry.cn-beijing.aliyuncs.com/acs/nvidia-device-plugin-recover"
    tag: v1.0.0-b56d26d-aliyun
    pullPolicy: IfNotPresent
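Any of these defaults can be overridden at install time with `--set` (or a custom values file). For example, to install on a single-master cluster with a different device-plugin tag — the tag shown here is a placeholder:

```bash
helm install --name gpushare --namespace kube-system \
  --set masterCount=1 \
  --set images.devicePlugin.tag=v1.0.0-example \
  gpushare-installer
```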
@@ -0,0 +1,12 @@
FROM debian:bullseye-slim
RUN echo "deb [arch=amd64] http://mirrors.aliyun.com/debian/ bullseye main non-free contrib" \
    > /etc/apt/sources.list
RUN apt-get update
RUN apt-get install -y curl tzdata iproute2 bash && \
    rm -rf /var/cache/apt/* && \
    cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
    echo "Asia/Shanghai" > /etc/timezone && \
    mkdir -p /dp-evict
ADD dp-evict /dp-evict
RUN chmod -R +x /dp-evict
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -xe

BASEDIR=$(dirname "$0")
echo "build docker images in $BASEDIR"

TIMESTAMP=$(date +%Y%m%d%H%M)

cd "$BASEDIR"

# docker build --no-cache -t $IMAGE -f $FILE $BASEDIR
docker build --network=host -t registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-evict:$TIMESTAMP .

docker push registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-evict:$TIMESTAMP

echo registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-evict:$TIMESTAMP
@@ -0,0 +1,31 @@
#!/usr/bin/env bash

set -e -x

backup_dir="/etc/kubernetes/manifests-backup"

public::common::log() {
    echo $(date +"[%Y%m%d %H:%M:%S]: ") $1
}

# Move the exclusive NVIDIA device plugin's static-pod manifest out of the
# kubelet manifest dir; the kubelet then stops that pod, freeing the GPUs
# for the gpushare device plugin.
public::evict::gpu-device-plugin() {
    dir=/etc/kubernetes/manifests/

    if [ -f /etc/kubernetes/manifests/nvidia-device-plugin.yml ]; then
        backup_dir="/etc/kubernetes/manifests-backup/"
        mkdir -p $backup_dir
        mv /etc/kubernetes/manifests/nvidia-device-plugin.yml $backup_dir
    else
        public::common::log "Skip removing nvidia-device-plugin.yml, because it doesn't exist."
    fi
}

main() {
    public::evict::gpu-device-plugin

    # Signal readiness, then keep the container alive.
    touch /ready
    while sleep 3600; do :; done
}

main "$@"
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
set -xe

if [ -d "/k8s-host" ]; then
    rm -rf /k8s-host/usr/local/dp-evict
    mkdir -p /k8s-host/usr/local/dp-evict
    cp -r /dp-evict/* /k8s-host/usr/local/dp-evict
    chmod -R +x /k8s-host/usr/local/dp-evict
    chroot /k8s-host /usr/local/dp-evict/dp-evict-on-host.sh "$@"
    while sleep 3600; do :; done
fi
@@ -0,0 +1,12 @@
FROM debian:bullseye-slim
RUN echo "deb [arch=amd64] http://mirrors.aliyun.com/debian/ bullseye main non-free contrib" \
    > /etc/apt/sources.list
RUN apt-get update
RUN apt-get install -y curl tzdata iproute2 bash && \
    rm -rf /var/cache/apt/* && \
    cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
    echo "Asia/Shanghai" > /etc/timezone && \
    mkdir -p /dp-evict
ADD dp-evict /dp-evict
RUN chmod -R +x /dp-evict
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -xe

BASEDIR=$(dirname "$0")
echo "build docker images in $BASEDIR"

TIMESTAMP=$(date +%Y%m%d%H%M)

cd "$BASEDIR"

# docker build --no-cache -t $IMAGE -f $FILE $BASEDIR
docker build --network=host -t registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-recover:$TIMESTAMP .

docker push registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-recover:$TIMESTAMP

echo registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-recover:$TIMESTAMP
@@ -0,0 +1,34 @@
#!/usr/bin/env bash

set -e -x

dir="/etc/kubernetes/manifests"
backup_dir="/etc/kubernetes/manifests-backup"

public::common::log() {
    echo $(date +"[%Y%m%d %H:%M:%S]: ") $1
}

# Put the exclusive NVIDIA device plugin's static-pod manifest back in place
# from the backup taken by dp-evict-on-host.sh.
public::recover::gpu-device-plugin() {
    if [ -f $dir/nvidia-device-plugin.yml ]; then
        public::common::log "Skip recovering nvidia-device-plugin.yml, because it already exists."
    else
        if [ -f $backup_dir/nvidia-device-plugin.yml ]; then
            cp -f $backup_dir/nvidia-device-plugin.yml $dir/nvidia-device-plugin.yml
            public::common::log "Finished recovering nvidia-device-plugin.yml."
        else
            public::common::log "No nvidia-device-plugin.yml to recover."
        fi
    fi
}

main() {
    public::recover::gpu-device-plugin

    # Signal readiness, then keep the container alive.
    touch /ready
    while sleep 3600; do :; done
}

main "$@"
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
set -xe

if [ -d "/k8s-host" ]; then
    rm -rf /k8s-host/usr/local/dp-evict
    mkdir -p /k8s-host/usr/local/dp-evict
    cp -r /dp-evict/* /k8s-host/usr/local/dp-evict
    chmod -R +x /k8s-host/usr/local/dp-evict
    chroot /k8s-host /usr/local/dp-evict/dp-recover-on-host.sh "$@"
    while sleep 3600; do :; done
fi
@@ -0,0 +1,14 @@
FROM debian:bullseye-slim
RUN echo "deb [arch=amd64] http://mirrors.aliyun.com/debian/ bullseye main non-free contrib" \
    > /etc/apt/sources.list
RUN apt-get update
RUN apt-get install -y curl tzdata iproute2 bash && \
    rm -rf /var/cache/apt/* && \
    cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
    echo "Asia/Shanghai" > /etc/timezone && \
    mkdir -p /schd-extender

ADD schd-extender /schd-extender

RUN chmod -R +x /schd-extender
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
set -xe

BASEDIR=$(dirname "$0")
echo "build docker images in $BASEDIR"

TIMESTAMP=$(date +%Y%m%d%H%M)

cd "$BASEDIR"

# docker build --no-cache -t $IMAGE -f $FILE $BASEDIR
docker build -t registry.cn-hangzhou.aliyuncs.com/acs/schd-extender-deployer:$TIMESTAMP .

docker tag registry.cn-hangzhou.aliyuncs.com/acs/schd-extender-deployer:$TIMESTAMP cheyang/schd-extender-deployer:$TIMESTAMP

docker push registry.cn-hangzhou.aliyuncs.com/acs/schd-extender-deployer:$TIMESTAMP
@@ -0,0 +1,75 @@
#!/usr/bin/env bash

set -e -x

dir=/etc/kubernetes/manifests

backup_dir="/etc/kubernetes/manifests-backup"

TIMESTAMP=$(date +%Y%m%d%H%M%S)

public::common::log() {
    echo $(date +"[%Y%m%d %H:%M:%S]: ") $1
}

public::deployer::sche-policy-config() {

    mkdir -p $backup_dir

    # Keep the first-seen files as *.ori.* and timestamp all later backups.
    if [ ! -f $backup_dir/kube-scheduler.ori.yaml ];then
        cp /etc/kubernetes/manifests/kube-scheduler.yaml $backup_dir/kube-scheduler.ori.yaml
        public::common::log "Backup $backup_dir/kube-scheduler.ori.yaml"
    else
        cp /etc/kubernetes/manifests/kube-scheduler.yaml $backup_dir/kube-scheduler-$TIMESTAMP.yaml
        public::common::log "Backup $backup_dir/kube-scheduler-$TIMESTAMP.yaml"
    fi

    if [ ! -f $backup_dir/scheduler-policy-config.ori.json ];then
        if [ -f /etc/kubernetes/scheduler-policy-config.json ];then
            cp /etc/kubernetes/scheduler-policy-config.json $backup_dir/scheduler-policy-config.ori.json
            public::common::log "Backup $backup_dir/scheduler-policy-config.ori.json"
        fi
    else
        if [ -f /etc/kubernetes/scheduler-policy-config.json ];then
            cp /etc/kubernetes/scheduler-policy-config.json $backup_dir/scheduler-policy-config-$TIMESTAMP.json
            public::common::log "Backup $backup_dir/scheduler-policy-config-$TIMESTAMP.json"
        fi
    fi

    public::common::log "Configure scheduler extender"
    # Point the policy at this master's own IP instead of 127.0.0.1.
    cp -f /schd-extender/scheduler-policy-config.json /etc/kubernetes/scheduler-policy-config.json
    sed -i 's/127.0.0.1/'"${NODE_IP}"'/g' /etc/kubernetes/scheduler-policy-config.json
    # Bump the revision annotation so the kubelet restarts the static kube-scheduler pod.
    if ! grep 'deployment.kubernetes.io/revision' $dir/kube-scheduler.yaml; then
        sed -i '/scheduler.alpha.kubernetes.io\/critical-pod/a \ deployment.kubernetes.io/revision: "'"${TIMESTAMP}"'"' $dir/kube-scheduler.yaml
    else
        # sed -i '/deployment.kubernetes.io\/revision/d' $dir/kube-scheduler.yaml
        sed -i 's#deployment.kubernetes.io/revision:.*#deployment.kubernetes.io/revision: "'"${TIMESTAMP}"'"#' $dir/kube-scheduler.yaml
    fi

    if ! grep 'policy-config-file=/etc/kubernetes/scheduler-policy-config.json' $dir/kube-scheduler.yaml; then
        sed -i "/- kube-scheduler/a\ \ \ \ - --policy-config-file=/etc/kubernetes/scheduler-policy-config.json" $dir/kube-scheduler.yaml
    else
        public::common::log "Skip the kube-scheduler config, because the extender is already configured."
    fi
    # add scheduler config policy volumeMounts
    if ! grep 'mountPath: /etc/kubernetes/scheduler-policy-config.json' $dir/kube-scheduler.yaml; then
        sed -i "/ volumeMounts:/a\ \ \ \ - mountPath: /etc/kubernetes/scheduler-policy-config.json\n name: scheduler-policy-config\n readOnly: true" $dir/kube-scheduler.yaml
    else
        public::common::log "Skip the scheduler-policy-config mountPath, because the extender is already configured."
    fi
    # add scheduler config policy volumes
    if ! grep 'path: /etc/kubernetes/scheduler-policy-config.json' $dir/kube-scheduler.yaml; then
        sed -i "/ volumes:/a \ - hostPath:\n path: /etc/kubernetes/scheduler-policy-config.json\n type: FileOrCreate\n name: scheduler-policy-config" $dir/kube-scheduler.yaml
    else
        public::common::log "Skip the scheduler-policy-config volumes, because the extender is already configured."
    fi
}

main() {
    public::deployer::sche-policy-config

    touch /ready
    #while sleep 3600; do :; done
}

main
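To verify the script took effect on a master node, a couple of quick checks against the paths used above:

```bash
# The static kube-scheduler pod should now pass the policy file flag
grep policy-config-file /etc/kubernetes/manifests/kube-scheduler.yaml

# The rendered policy should point at this master's IP, not 127.0.0.1
cat /etc/kubernetes/scheduler-policy-config.json
```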
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
set -xe

if [ -d "/k8s-host" ]; then
    rm -rf /k8s-host/usr/local/k8s-schd-extender
    mkdir -p /k8s-host/usr/local/k8s-schd-extender
    cp -r /schd-extender/* /k8s-host/usr/local/k8s-schd-extender
    chmod -R +x /k8s-host/usr/local/k8s-schd-extender/
    chroot /k8s-host /usr/local/k8s-schd-extender/install-sched-extender-on-host.sh
    while sleep 3600; do :; done
fi
@@ -0,0 +1,20 @@
{
  "kind": "Policy",
  "apiVersion": "v1",
  "extenders": [
    {
      "urlPrefix": "http://127.0.0.1:32766/gpushare-scheduler",
      "filterVerb": "filter",
      "bindVerb": "bind",
      "enableHttps": false,
      "nodeCacheCapable": true,
      "managedResources": [
        {
          "name": "aliyun.com/gpu-mem",
          "ignoredByScheduler": false
        }
      ],
      "ignorable": false
    }
  ]
}
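For reference, the scheduler calls the extender by POSTing `ExtenderArgs` JSON to `urlPrefix` plus the configured verb. A hypothetical smoke test of the filter endpoint; the payload shape follows the upstream scheduler-extender convention, and since `nodeCacheCapable` is true, node names are passed rather than full node objects:

```bash
curl -s -X POST http://127.0.0.1:32766/gpushare-scheduler/filter \
  -H 'Content-Type: application/json' \
  -d '{"Pod":{"metadata":{"name":"demo","namespace":"default"}},"NodeNames":["mynode"]}'
```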