Files
Rainbond/gpushare-scheduler-extender/deployer/chart/gpushare-installer/templates/device-plugin-recover.yaml
2025-08-25 16:04:00 +08:00

53 lines
1.5 KiB
YAML

# DaemonSet that runs /dp-evict/dp-recover-on-host.sh in a privileged,
# host-networked container on every node that is labelled gpushare=false and
# also carries the aliyun.accelerator/nvidia_count label.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: device-plugin-recover-ds
  namespace: kube-system
spec:
  updateStrategy:
    type: RollingUpdate
  # Must match the pod template labels below.
  selector:
    matchLabels:
      component: gpushare-device-plugin
      app: gpushare
      name: device-plugin-recover-ds
  template:
    metadata:
      annotations:
        # NOTE(review): this alpha annotation is deprecated upstream in favour
        # of spec.priorityClassName; confirm the target cluster still honours
        # it before relying on it.
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        component: gpushare-device-plugin
        app: gpushare
        name: device-plugin-recover-ds
    spec:
      # Both conditions apply: the node label gpushare must equal "false" AND
      # the node must expose the aliyun.accelerator/nvidia_count label.
      nodeSelector:
        gpushare: "false"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: aliyun.accelerator/nvidia_count
                    operator: Exists
      # Disabled alternative selector. Do not simply uncomment it: a second
      # nodeSelector key would duplicate the one above.
      # nodeSelector:
      #   gpu-instance: "true"
      hostNetwork: true
      containers:
        - name: gpushare
          image: "{{ .Values.images.recover.image }}:{{ .Values.images.recover.tag }}"
          # Quoted like the image value so the rendered scalar is always a
          # string, even when the chart value is empty.
          imagePullPolicy: "{{ .Values.images.recover.pullPolicy }}"
          command:
            - bash
            - /dp-evict/dp-recover-on-host.sh
          # NOTE(review): an earlier comment here described this pod as a
          # Guaranteed pod that is never evicted (the text said "recovered")
          # because of node resource consumption. No resource requests/limits
          # are declared on this container, so that QoS class does not apply
          # as written; add equal requests and limits if it is wanted.
          securityContext:
            privileged: true
          volumeMounts:
            - name: kube-dir
              mountPath: /etc/kubernetes
      volumes:
        # Host's /etc/kubernetes; type Directory requires it to already exist.
        - name: kube-dir
          hostPath:
            path: /etc/kubernetes
            type: Directory