synchronization
gpushare-scheduler-extender/deployer/README.md
@@ -0,0 +1,61 @@
## Install GPU Sharing with Helm charts in Alibaba Cloud Kubernetes Service

## Requirements:

* Kubernetes >= 1.11, kubectl >= 1.12

* [Alibaba Cloud Kubernetes Service](https://www.alibabacloud.com/product/kubernetes) is recommended. This solution only supports dedicated Kubernetes clusters.

## Steps:

1. Just run:

```bash
git clone https://github.com/AliyunContainerService/gpushare-scheduler-extender.git
cd gpushare-scheduler-extender/deployer/chart
helm install --name gpushare --namespace kube-system --set masterCount=3 gpushare-installer
```

2. Add the gpushare label to each node that should share its GPUs:

```bash
kubectl label node <target_node> gpushare=true
```

For example:

```bash
kubectl label node mynode gpushare=true
```

3. Install the kubectl extension (steps 4 and 5):

4. Install kubectl 1.12 or above. You can download and install `kubectl` for Linux:

```bash
curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.12.1/bin/linux/amd64/kubectl
chmod +x ./kubectl
sudo mv ./kubectl /usr/bin/kubectl
```

5. Download and install the kubectl extension:

```bash
cd /usr/bin/
wget https://github.com/AliyunContainerService/gpushare-device-plugin/releases/download/v0.3.0/kubectl-inspect-gpushare
chmod u+x /usr/bin/kubectl-inspect-gpushare
```

6. To disable GPU sharing on a node, set its label to false (`--overwrite` is required when changing an existing label value):

```bash
kubectl label node <target_node> gpushare=false --overwrite
```

For example:

```bash
kubectl label node mynode gpushare=false --overwrite
```
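Once the extension is installed, you can inspect shared-GPU allocation across the cluster. A quick sketch of typical usage (the exact output depends on your nodes and running pods):

```bash
# Summary of allocated vs. total shared GPU memory per node
kubectl inspect gpushare

# Per-pod detail (-d) for each GPU on each node
kubectl inspect gpushare -d
```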
@@ -0,0 +1,21 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*~
# Various IDEs
.project
.idea/
*.tmproj
@@ -0,0 +1,5 @@
apiVersion: v1
appVersion: "1.0"
description: A Helm chart for Kubernetes
name: gpushare-installer
version: 0.7.0
@@ -0,0 +1,27 @@
### 0.1.0

* support gpushare deployment

### 0.2.0

* fix failure to restore exclusive GPU scheduling after removing gpushare

### 0.3.0

* support helm v3

### 0.4.0

* delete env kubeVersion

### 0.5.0

* change the host mount dir to /etc/kubernetes

### 0.6.0

* change statefulset to job

### 0.7.0

* support unhealthy configmap
@@ -0,0 +1,32 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
*/}}
{{- define "gpushare-installer.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "gpushare-installer.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}

{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "gpushare-installer.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end -}}
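The job.yaml template below consumes the `gpushare-installer.chart` helper for its labels. For reference, this is how the other helpers would be referenced from a manifest; a sketch only, since no template in this chart currently uses `fullname`:

```yaml
metadata:
  # For release "gpushare", fullname renders "gpushare-gpushare-installer"
  name: {{ include "gpushare-installer.fullname" . }}
  labels:
    # Renders "gpushare-installer-0.7.0" for this chart version
    chart: {{ template "gpushare-installer.chart" . }}
```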
@@ -0,0 +1,43 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: device-plugin-evict-ds
  namespace: kube-system
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      component: gpushare-device-plugin
      app: gpushare
      name: device-plugin-evict-ds
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        component: gpushare-device-plugin
        app: gpushare
        name: device-plugin-evict-ds
    spec:
      hostNetwork: true
      # Runs only on nodes opted in to GPU sharing.
      nodeSelector:
        gpushare: "true"
      containers:
      - image: "{{ .Values.images.evictor.image }}:{{ .Values.images.evictor.tag }}"
        imagePullPolicy: {{ .Values.images.evictor.pullPolicy }}
        command:
        - bash
        - /dp-evict/dp-evict-on-host.sh
        name: gpushare
        # Privileged so it can move the static-pod manifest under the host's /etc/kubernetes.
        securityContext:
          privileged: true
        volumeMounts:
        - name: kube-dir
          mountPath: /etc/kubernetes
      volumes:
      - hostPath:
          path: /etc/kubernetes
          type: Directory
        name: kube-dir
@@ -0,0 +1,52 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: device-plugin-recover-ds
  namespace: kube-system
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      component: gpushare-device-plugin
      app: gpushare
      name: device-plugin-recover-ds
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        component: gpushare-device-plugin
        app: gpushare
        name: device-plugin-recover-ds
    spec:
      # Runs on GPU nodes that have opted out of sharing again.
      nodeSelector:
        gpushare: "false"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: aliyun.accelerator/nvidia_count
                operator: Exists
      # nodeSelector:
      #   gpu-instance: "true"
      hostNetwork: true
      containers:
      - image: "{{ .Values.images.recover.image }}:{{ .Values.images.recover.tag }}"
        imagePullPolicy: {{ .Values.images.recover.pullPolicy }}
        command:
        - bash
        - /dp-evict/dp-recover-on-host.sh
        name: gpushare
        # Privileged so it can restore the static-pod manifest under the host's /etc/kubernetes.
        securityContext:
          privileged: true
        volumeMounts:
        - name: kube-dir
          mountPath: /etc/kubernetes
      volumes:
      - hostPath:
          path: /etc/kubernetes
          type: Directory
        name: kube-dir
@@ -0,0 +1,61 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gpushare-device-plugin-ds
  namespace: kube-system
spec:
  selector:
    matchLabels:
      component: gpushare-device-plugin
      app: gpushare
      name: gpushare-device-plugin-ds
      type: runtime
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        component: gpushare-device-plugin
        app: gpushare
        name: gpushare-device-plugin-ds
        type: runtime
    spec:
      serviceAccount: gpushare-device-plugin
      hostNetwork: true
      nodeSelector:
        gpushare: "true"
      containers:
      - image: "{{ .Values.images.devicePlugin.image }}:{{ .Values.images.devicePlugin.tag }}"
        imagePullPolicy: {{ .Values.images.devicePlugin.pullPolicy }}
        name: gpushare
        command:
        - gpushare-device-plugin-v2
        - -logtostderr
        - --v=5
        - --memory-unit=GiB
        # Equal requests and limits make this a Guaranteed pod, which will never be
        # evicted because of the node's resource consumption.
        resources:
          limits:
            memory: "300Mi"
            cpu: "1"
          requests:
            memory: "300Mi"
            cpu: "1"
        env:
        - name: KUBECONFIG
          value: /etc/kubernetes/kubelet.conf
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        volumeMounts:
        - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
      volumes:
      - name: device-plugin
        hostPath:
          path: /var/lib/kubelet/device-plugins
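With the device plugin advertising shared GPU memory, workloads request it through the `aliyun.com/gpu-mem` extended resource (the same resource name the scheduler policy below manages). A minimal sketch of a consuming pod; the pod name and image are illustrative only:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: gpu-mem-demo          # hypothetical example pod
spec:
  containers:
  - name: demo
    image: cuda-app:latest    # placeholder image for a GPU workload
    resources:
      limits:
        # Units are GiB because the plugin runs with --memory-unit=GiB
        aliyun.com/gpu-mem: 2
```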
@@ -0,0 +1,59 @@
# rbac.yaml
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-device-plugin
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - events
  verbs:
  - create
  - patch
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - update
  - patch
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - nodes/status
  verbs:
  - patch
  - update
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpushare-device-plugin
  namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-device-plugin
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: gpushare-device-plugin
subjects:
- kind: ServiceAccount
  name: gpushare-device-plugin
  namespace: kube-system
@@ -0,0 +1,45 @@
# deployment.yaml
---
kind: Deployment
apiVersion: apps/v1
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: gpushare
      component: gpushare-schd-extender
      type: runtime
  replicas: 1
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: gpushare
        component: gpushare-schd-extender
        type: runtime
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ''
    spec:
      hostNetwork: true
      tolerations:
      - effect: NoSchedule
        operator: Exists
        key: node-role.kubernetes.io/master
      - effect: NoSchedule
        operator: Exists
        key: node.cloudprovider.kubernetes.io/uninitialized
      nodeSelector:
        node-role.kubernetes.io/master: ""
      serviceAccount: gpushare-schd-extender
      containers:
      - name: gpushare-schd-extender
        image: "{{ .Values.images.extender.image }}:{{ .Values.images.extender.tag }}"
        imagePullPolicy: {{ .Values.images.extender.pullPolicy }}
        env:
        - name: LOG_LEVEL
          value: debug
        - name: PORT
          value: "12345"
@@ -0,0 +1,67 @@
# rbac.yaml
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-schd-extender
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - events
  verbs:
  - create
  - patch
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - update
  - patch
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - bindings
  - pods/binding
  verbs:
  - create
- apiGroups:
  - ""
  resources:
  - configmaps
  verbs:
  - get
  - list
  - watch
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: gpushare-schd-extender
subjects:
- kind: ServiceAccount
  name: gpushare-schd-extender
  namespace: kube-system
|
@@ -0,0 +1,19 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: gpushare-schd-extender
|
||||
namespace: kube-system
|
||||
labels:
|
||||
app: gpushare
|
||||
component: gpushare-schd-extender
|
||||
spec:
|
||||
# type: ClusterIP
|
||||
type: NodePort
|
||||
ports:
|
||||
- port: 12345
|
||||
name: http
|
||||
targetPort: 12345
|
||||
nodePort: 32766
|
||||
selector:
|
||||
app: gpushare
|
||||
component: gpushare-schd-extender
|
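The fixed nodePort (32766) is what ties this Service to the `urlPrefix` in the scheduler-policy-config.json further down. A quick reachability check from a master node; this assumes the extender exposes a `/version` endpoint, so adjust if your build differs:

```bash
# Should return the extender's version string if the service is reachable
curl 127.0.0.1:32766/version
```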
@@ -0,0 +1,66 @@
apiVersion: batch/v1
kind: Job
metadata:
  name: gpushare-installer
  namespace: kube-system
  labels:
    app: gpushare
    name: gpushare-installer
    chart: {{ template "gpushare-installer.chart" . }}
    release: {{ .Release.Name }}
    heritage: {{ .Release.Service }}
spec:
  # One installer pod per master, so every kube-scheduler instance gets configured.
  parallelism: {{ .Values.masterCount }}
  template:
    metadata:
      labels:
        chart: {{ template "gpushare-installer.chart" . }}
        release: {{ .Release.Name }}
        heritage: {{ .Release.Service }}
        app: gpushare
        name: gpushare-installer
    spec:
      hostNetwork: true
      tolerations:
      - effect: NoSchedule
        operator: Exists
        key: node-role.kubernetes.io/master
      - effect: NoSchedule
        operator: Exists
        key: node.cloudprovider.kubernetes.io/uninitialized
      nodeSelector:
        node-role.kubernetes.io/master: ""
      restartPolicy: OnFailure
      containers:
      - name: deploy-schd
        image: "{{ .Values.images.installer.image }}:{{ .Values.images.installer.tag }}"
        imagePullPolicy: {{ .Values.images.installer.pullPolicy }}
        securityContext:
          privileged: true
        command:
        - bash
        - /schd-extender/install-sched-extender-on-host.sh
        env:
        - name: NODE_IP
          valueFrom:
            fieldRef:
              fieldPath: status.hostIP
        volumeMounts:
        - name: kube-dir
          mountPath: /etc/kubernetes
      volumes:
      - hostPath:
          path: /etc/kubernetes
          type: Directory
        name: kube-dir
      # Anti-affinity spreads the parallel pods across masters instead of stacking them.
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
              - key: name
                operator: In
                values:
                - gpushare-installer
            topologyKey: "kubernetes.io/hostname"
@@ -0,0 +1,29 @@
# Default values for gpushare-installer.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

masterCount: 3

images:
  extender:
    image: "registry.cn-beijing.aliyuncs.com/acs/k8s-gpushare-schd-extender"
    tag: v1.0.0-ce6f800-aliyun
    pullPolicy: IfNotPresent
  installer:
    image: "registry.cn-beijing.aliyuncs.com/acs/schd-extender-deployer"
    tag: v1.0.0-b56d26d-aliyun
    pullPolicy: IfNotPresent
  devicePlugin:
    image: "registry.cn-beijing.aliyuncs.com/acs/k8s-gpushare-plugin"
    tag: v1.0.0-2656995-aliyun
    pullPolicy: IfNotPresent
  evictor:
    image: "registry.cn-beijing.aliyuncs.com/acs/nvidia-device-plugin-evict"
    tag: v1.0.0-b56d26d-aliyun
    pullPolicy: IfNotPresent
  recover:
    image: "registry.cn-beijing.aliyuncs.com/acs/nvidia-device-plugin-recover"
    tag: v1.0.0-b56d26d-aliyun
    pullPolicy: IfNotPresent
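Any of these defaults can be overridden at install time with `--set` (or a custom values file). For example, to install on a single-master cluster with a different device-plugin tag — the tag shown here is a placeholder:

```bash
helm install --name gpushare --namespace kube-system \
  --set masterCount=1 \
  --set images.devicePlugin.tag=v1.0.0-example \
  gpushare-installer
```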
@@ -0,0 +1,12 @@
FROM debian:bullseye-slim
RUN echo "deb [arch=amd64] http://mirrors.aliyun.com/debian/ bullseye main non-free contrib" \
    > /etc/apt/sources.list
RUN apt-get update
RUN apt-get install -y curl tzdata iproute2 bash && \
    rm -rf /var/cache/apt/* && \
    cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
    echo "Asia/Shanghai" > /etc/timezone && \
    mkdir -p /dp-evict
ADD dp-evict /dp-evict
RUN chmod -R +x /dp-evict
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -xe

BASEDIR=$(dirname "$0")
echo "build docker images in $BASEDIR"

TIMESTAMP=$(date +%Y%m%d%H%M)

cd "$BASEDIR"

# docker build --no-cache -t $IMAGE -f $FILE $BASEDIR
docker build --network=host -t registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-evict:$TIMESTAMP .

docker push registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-evict:$TIMESTAMP

echo registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-evict:$TIMESTAMP
@@ -0,0 +1,31 @@
#!/usr/bin/env bash

set -e -x

backup_dir="/etc/kubernetes/manifests-backup"

public::common::log() {
    echo $(date +"[%Y%m%d %H:%M:%S]: ") $1
}

# Move the exclusive NVIDIA device plugin's static-pod manifest out of the
# kubelet manifest dir; the kubelet then stops that pod, freeing the GPUs
# for the gpushare device plugin.
public::evict::gpu-device-plugin() {
    dir=/etc/kubernetes/manifests/

    if [ -f /etc/kubernetes/manifests/nvidia-device-plugin.yml ]; then
        backup_dir="/etc/kubernetes/manifests-backup/"
        mkdir -p $backup_dir
        mv /etc/kubernetes/manifests/nvidia-device-plugin.yml $backup_dir
    else
        public::common::log "Skip removing nvidia-device-plugin.yml, because it doesn't exist."
    fi
}

main() {
    public::evict::gpu-device-plugin

    # Signal readiness, then keep the container alive.
    touch /ready
    while sleep 3600; do :; done
}

main "$@"
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
set -xe

if [ -d "/k8s-host" ]; then
    rm -rf /k8s-host/usr/local/dp-evict
    mkdir -p /k8s-host/usr/local/dp-evict
    cp -r /dp-evict/* /k8s-host/usr/local/dp-evict
    chmod -R +x /k8s-host/usr/local/dp-evict
    chroot /k8s-host /usr/local/dp-evict/dp-evict-on-host.sh "$@"
    while sleep 3600; do :; done
fi
@@ -0,0 +1,12 @@
FROM debian:bullseye-slim
RUN echo "deb [arch=amd64] http://mirrors.aliyun.com/debian/ bullseye main non-free contrib" \
    > /etc/apt/sources.list
RUN apt-get update
RUN apt-get install -y curl tzdata iproute2 bash && \
    rm -rf /var/cache/apt/* && \
    cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
    echo "Asia/Shanghai" > /etc/timezone && \
    mkdir -p /dp-evict
ADD dp-evict /dp-evict
RUN chmod -R +x /dp-evict
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -xe

BASEDIR=$(dirname "$0")
echo "build docker images in $BASEDIR"

TIMESTAMP=$(date +%Y%m%d%H%M)

cd "$BASEDIR"

# docker build --no-cache -t $IMAGE -f $FILE $BASEDIR
docker build --network=host -t registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-recover:$TIMESTAMP .

docker push registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-recover:$TIMESTAMP

echo registry.cn-hangzhou.aliyuncs.com/acs/nvidia-device-plugin-recover:$TIMESTAMP
@@ -0,0 +1,34 @@
#!/usr/bin/env bash

set -e -x

dir="/etc/kubernetes/manifests"
backup_dir="/etc/kubernetes/manifests-backup"

public::common::log() {
    echo $(date +"[%Y%m%d %H:%M:%S]: ") $1
}

# Put the exclusive NVIDIA device plugin's static-pod manifest back in place
# from the backup taken by dp-evict-on-host.sh.
public::recover::gpu-device-plugin() {
    if [ -f $dir/nvidia-device-plugin.yml ]; then
        public::common::log "Skip recovering nvidia-device-plugin.yml, because it already exists."
    else
        if [ -f $backup_dir/nvidia-device-plugin.yml ]; then
            cp -f $backup_dir/nvidia-device-plugin.yml $dir/nvidia-device-plugin.yml
            public::common::log "Finished recovering nvidia-device-plugin.yml."
        else
            public::common::log "No nvidia-device-plugin.yml to recover."
        fi
    fi
}

main() {
    public::recover::gpu-device-plugin

    # Signal readiness, then keep the container alive.
    touch /ready
    while sleep 3600; do :; done
}

main "$@"
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
set -xe

if [ -d "/k8s-host" ]; then
    rm -rf /k8s-host/usr/local/dp-evict
    mkdir -p /k8s-host/usr/local/dp-evict
    cp -r /dp-evict/* /k8s-host/usr/local/dp-evict
    chmod -R +x /k8s-host/usr/local/dp-evict
    chroot /k8s-host /usr/local/dp-evict/dp-recover-on-host.sh "$@"
    while sleep 3600; do :; done
fi
@@ -0,0 +1,14 @@
FROM debian:bullseye-slim
RUN echo "deb [arch=amd64] http://mirrors.aliyun.com/debian/ bullseye main non-free contrib" \
    > /etc/apt/sources.list
RUN apt-get update
RUN apt-get install -y curl tzdata iproute2 bash && \
    rm -rf /var/cache/apt/* && \
    cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
    echo "Asia/Shanghai" > /etc/timezone && \
    mkdir -p /schd-extender

ADD schd-extender /schd-extender

RUN chmod -R +x /schd-extender
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
set -xe

BASEDIR=$(dirname "$0")
echo "build docker images in $BASEDIR"

TIMESTAMP=$(date +%Y%m%d%H%M)

cd "$BASEDIR"

# docker build --no-cache -t $IMAGE -f $FILE $BASEDIR
docker build -t registry.cn-hangzhou.aliyuncs.com/acs/schd-extender-deployer:$TIMESTAMP .

docker tag registry.cn-hangzhou.aliyuncs.com/acs/schd-extender-deployer:$TIMESTAMP cheyang/schd-extender-deployer:$TIMESTAMP

docker push registry.cn-hangzhou.aliyuncs.com/acs/schd-extender-deployer:$TIMESTAMP
@@ -0,0 +1,75 @@
#!/usr/bin/env bash

set -e -x

dir=/etc/kubernetes/manifests

backup_dir="/etc/kubernetes/manifests-backup"

TIMESTAMP=$(date +%Y%m%d%H%M%S)

public::common::log() {
    echo $(date +"[%Y%m%d %H:%M:%S]: ") $1
}

public::deployer::sche-policy-config() {

    mkdir -p $backup_dir

    # Keep the first-seen files as *.ori.* and timestamp all later backups.
    if [ ! -f $backup_dir/kube-scheduler.ori.yaml ];then
        cp /etc/kubernetes/manifests/kube-scheduler.yaml $backup_dir/kube-scheduler.ori.yaml
        public::common::log "Backup $backup_dir/kube-scheduler.ori.yaml"
    else
        cp /etc/kubernetes/manifests/kube-scheduler.yaml $backup_dir/kube-scheduler-$TIMESTAMP.yaml
        public::common::log "Backup $backup_dir/kube-scheduler-$TIMESTAMP.yaml"
    fi

    if [ ! -f $backup_dir/scheduler-policy-config.ori.json ];then
        if [ -f /etc/kubernetes/scheduler-policy-config.json ];then
            cp /etc/kubernetes/scheduler-policy-config.json $backup_dir/scheduler-policy-config.ori.json
            public::common::log "Backup $backup_dir/scheduler-policy-config.ori.json"
        fi
    else
        if [ -f /etc/kubernetes/scheduler-policy-config.json ];then
            cp /etc/kubernetes/scheduler-policy-config.json $backup_dir/scheduler-policy-config-$TIMESTAMP.json
            public::common::log "Backup $backup_dir/scheduler-policy-config-$TIMESTAMP.json"
        fi
    fi

    public::common::log "Configure scheduler extender"
    # Point the policy at this master's own IP instead of 127.0.0.1.
    cp -f /schd-extender/scheduler-policy-config.json /etc/kubernetes/scheduler-policy-config.json
    sed -i 's/127.0.0.1/'"${NODE_IP}"'/g' /etc/kubernetes/scheduler-policy-config.json
    # Bump the revision annotation so the kubelet restarts the static kube-scheduler pod.
    if ! grep 'deployment.kubernetes.io/revision' $dir/kube-scheduler.yaml; then
        sed -i '/scheduler.alpha.kubernetes.io\/critical-pod/a \ deployment.kubernetes.io/revision: "'"${TIMESTAMP}"'"' $dir/kube-scheduler.yaml
    else
        # sed -i '/deployment.kubernetes.io\/revision/d' $dir/kube-scheduler.yaml
        sed -i 's#deployment.kubernetes.io/revision:.*#deployment.kubernetes.io/revision: "'"${TIMESTAMP}"'"#' $dir/kube-scheduler.yaml
    fi

    if ! grep 'policy-config-file=/etc/kubernetes/scheduler-policy-config.json' $dir/kube-scheduler.yaml; then
        sed -i "/- kube-scheduler/a\ \ \ \ - --policy-config-file=/etc/kubernetes/scheduler-policy-config.json" $dir/kube-scheduler.yaml
    else
        public::common::log "Skip the kube-scheduler config, because the extender is already configured."
    fi
    # add scheduler config policy volumeMounts
    if ! grep 'mountPath: /etc/kubernetes/scheduler-policy-config.json' $dir/kube-scheduler.yaml; then
        sed -i "/ volumeMounts:/a\ \ \ \ - mountPath: /etc/kubernetes/scheduler-policy-config.json\n name: scheduler-policy-config\n readOnly: true" $dir/kube-scheduler.yaml
    else
        public::common::log "Skip the scheduler-policy-config mountPath, because the extender is already configured."
    fi
    # add scheduler config policy volumes
    if ! grep 'path: /etc/kubernetes/scheduler-policy-config.json' $dir/kube-scheduler.yaml; then
        sed -i "/ volumes:/a \ - hostPath:\n path: /etc/kubernetes/scheduler-policy-config.json\n type: FileOrCreate\n name: scheduler-policy-config" $dir/kube-scheduler.yaml
    else
        public::common::log "Skip the scheduler-policy-config volumes, because the extender is already configured."
    fi
}

main() {
    public::deployer::sche-policy-config

    touch /ready
    #while sleep 3600; do :; done
}

main
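To verify the script took effect on a master node, a couple of quick checks against the paths used above:

```bash
# The static kube-scheduler pod should now pass the policy file flag
grep policy-config-file /etc/kubernetes/manifests/kube-scheduler.yaml

# The rendered policy should point at this master's IP, not 127.0.0.1
cat /etc/kubernetes/scheduler-policy-config.json
```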
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
set -xe

if [ -d "/k8s-host" ]; then
    rm -rf /k8s-host/usr/local/k8s-schd-extender
    mkdir -p /k8s-host/usr/local/k8s-schd-extender
    cp -r /schd-extender/* /k8s-host/usr/local/k8s-schd-extender
    chmod -R +x /k8s-host/usr/local/k8s-schd-extender/
    chroot /k8s-host /usr/local/k8s-schd-extender/install-sched-extender-on-host.sh
    while sleep 3600; do :; done
fi
@@ -0,0 +1,20 @@
{
  "kind": "Policy",
  "apiVersion": "v1",
  "extenders": [
    {
      "urlPrefix": "http://127.0.0.1:32766/gpushare-scheduler",
      "filterVerb": "filter",
      "bindVerb": "bind",
      "enableHttps": false,
      "nodeCacheCapable": true,
      "managedResources": [
        {
          "name": "aliyun.com/gpu-mem",
          "ignoredByScheduler": false
        }
      ],
      "ignorable": false
    }
  ]
}
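For reference, the scheduler calls the extender by POSTing `ExtenderArgs` JSON to `urlPrefix` plus the configured verb. A hypothetical smoke test of the filter endpoint; the payload shape follows the upstream scheduler-extender convention, and since `nodeCacheCapable` is true, node names are passed rather than full node objects:

```bash
curl -s -X POST http://127.0.0.1:32766/gpushare-scheduler/filter \
  -H 'Content-Type: application/json' \
  -d '{"Pod":{"metadata":{"name":"demo","namespace":"default"}},"NodeNames":["mynode"]}'
```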