synchronization

commit 33f9b3ce46
Date: 2025-08-25 16:04:00 +08:00
1951 changed files with 854396 additions and 0 deletions

@@ -0,0 +1,21 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*~
# Various IDEs
.project
.idea/
*.tmproj

@@ -0,0 +1,5 @@
apiVersion: v1
appVersion: "1.0"
description: A Helm chart for Kubernetes
name: gpushare-installer
version: 0.7.0

@@ -0,0 +1,27 @@
### 0.1.0
* Support gpushare deployment
### 0.2.0
* Fix GPU exclusive scheduling not being recovered after removing gpushare
### 0.3.0
* Support Helm v3
### 0.4.0
* Delete the kubeVersion env
### 0.5.0
* Change the host mount directory to /etc/kubernetes
### 0.6.0
* Change the StatefulSet to a Job
### 0.7.0
* Support the unhealthy ConfigMap

@@ -0,0 +1,32 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
*/}}
{{- define "gpushare-installer.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "gpushare-installer.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "gpushare-installer.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end -}}
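
Of the three helpers defined above, only gpushare-installer.chart is referenced by the templates in this commit (in the installer Job's labels). As a hedged illustration only, a hypothetical template in this chart would typically consume the name helpers like this:

# Illustrative only: typical use of the helpers above; this snippet is not part of the commit.
metadata:
  name: {{ template "gpushare-installer.fullname" . }}
  labels:
    app: {{ template "gpushare-installer.name" . }}
    chart: {{ template "gpushare-installer.chart" . }}
    release: {{ .Release.Name }}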

@@ -0,0 +1,43 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: device-plugin-evict-ds
  namespace: kube-system
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      component: gpushare-device-plugin
      app: gpushare
      name: device-plugin-evict-ds
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        component: gpushare-device-plugin
        app: gpushare
        name: device-plugin-evict-ds
    spec:
      hostNetwork: true
      nodeSelector:
        gpushare: "true"
      containers:
      - image: "{{ .Values.images.evictor.image }}:{{ .Values.images.evictor.tag }}"
        imagePullPolicy: {{ .Values.images.evictor.pullPolicy }}
        command:
        - bash
        - /dp-evict/dp-evict-on-host.sh
        name: gpushare
        # Make this pod a Guaranteed pod so that it is never evicted due to the node's resource consumption.
        securityContext:
          privileged: true
        volumeMounts:
        - name: kube-dir
          mountPath: /etc/kubernetes
      volumes:
      - hostPath:
          path: /etc/kubernetes
          type: Directory
        name: kube-dir

@@ -0,0 +1,52 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: device-plugin-recover-ds
  namespace: kube-system
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      component: gpushare-device-plugin
      app: gpushare
      name: device-plugin-recover-ds
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        component: gpushare-device-plugin
        app: gpushare
        name: device-plugin-recover-ds
    spec:
      nodeSelector:
        gpushare: "false"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: aliyun.accelerator/nvidia_count
                operator: Exists
      # nodeSelector:
      #   gpu-instance: "true"
      hostNetwork: true
      containers:
      - image: "{{ .Values.images.recover.image }}:{{ .Values.images.recover.tag }}"
        imagePullPolicy: {{ .Values.images.recover.pullPolicy }}
        command:
        - bash
        - /dp-evict/dp-recover-on-host.sh
        name: gpushare
        # Make this pod a Guaranteed pod so that it is never evicted due to the node's resource consumption.
        securityContext:
          privileged: true
        volumeMounts:
        - name: kube-dir
          mountPath: /etc/kubernetes
      volumes:
      - hostPath:
          path: /etc/kubernetes
          type: Directory
        name: kube-dir
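
Taken together, the two DaemonSets above key off node labels: the evict DaemonSet runs on nodes labeled gpushare=true, while the recover DaemonSet runs on nodes labeled gpushare=false that still carry the aliyun.accelerator/nvidia_count label. A minimal sketch of the labels involved (node name and the nvidia_count value are illustrative; only the label's existence is checked):

# Illustrative node labels only; not part of this commit.
apiVersion: v1
kind: Node
metadata:
  name: gpu-node-1                        # hypothetical node name
  labels:
    gpushare: "true"                      # flip to "false" to have the recover DaemonSet run instead
    aliyun.accelerator/nvidia_count: "1"  # matched by the recover DaemonSet's Exists expression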

@@ -0,0 +1,61 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gpushare-device-plugin-ds
  namespace: kube-system
spec:
  selector:
    matchLabels:
      component: gpushare-device-plugin
      app: gpushare
      name: gpushare-device-plugin-ds
      type: runtime
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        component: gpushare-device-plugin
        app: gpushare
        name: gpushare-device-plugin-ds
        type: runtime
    spec:
      serviceAccount: gpushare-device-plugin
      hostNetwork: true
      nodeSelector:
        gpushare: "true"
      containers:
      - image: "{{ .Values.images.devicePlugin.image }}:{{ .Values.images.devicePlugin.tag }}"
        imagePullPolicy: {{ .Values.images.devicePlugin.pullPolicy }}
        name: gpushare
        # Make this pod a Guaranteed pod so that it is never evicted due to the node's resource consumption.
        command:
        - gpushare-device-plugin-v2
        - -logtostderr
        - --v=5
        - --memory-unit=GiB
        resources:
          limits:
            memory: "300Mi"
            cpu: "1"
          requests:
            memory: "300Mi"
            cpu: "1"
        env:
        - name: KUBECONFIG
          value: /etc/kubernetes/kubelet.conf
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        volumeMounts:
        - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
      volumes:
      - name: device-plugin
        hostPath:
          path: /var/lib/kubelet/device-plugins
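
Because the device plugin is started with --memory-unit=GiB, it advertises GPU memory in GiB increments as an extended resource. A hedged example of a workload consuming that resource, assuming the aliyun.com/gpu-mem resource name used by the upstream gpushare project (the name is not defined anywhere in this commit):

# Illustrative consumer pod only; not part of this commit.
apiVersion: v1
kind: Pod
metadata:
  name: gpu-mem-demo
spec:
  containers:
  - name: cuda
    image: nvidia/cuda:11.0-base          # any CUDA-capable image
    command: ["sleep", "infinity"]
    resources:
      limits:
        aliyun.com/gpu-mem: 3             # 3 GiB of shared GPU memory (GiB per --memory-unit=GiB)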

@@ -0,0 +1,59 @@
# rbac.yaml
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-device-plugin
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - events
  verbs:
  - create
  - patch
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - update
  - patch
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - nodes/status
  verbs:
  - patch
  - update
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpushare-device-plugin
  namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-device-plugin
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: gpushare-device-plugin
subjects:
- kind: ServiceAccount
  name: gpushare-device-plugin
  namespace: kube-system

@@ -0,0 +1,45 @@
# deployment yaml
---
kind: Deployment
apiVersion: apps/v1
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: gpushare
      component: gpushare-schd-extender
      type: runtime
  replicas: 1
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: gpushare
        component: gpushare-schd-extender
        type: runtime
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ''
    spec:
      hostNetwork: true
      tolerations:
      - effect: NoSchedule
        operator: Exists
        key: node-role.kubernetes.io/master
      - effect: NoSchedule
        operator: Exists
        key: node.cloudprovider.kubernetes.io/uninitialized
      nodeSelector:
        node-role.kubernetes.io/master: ""
      serviceAccount: gpushare-schd-extender
      containers:
      - name: gpushare-schd-extender
        image: "{{ .Values.images.extender.image }}:{{ .Values.images.extender.tag }}"
        imagePullPolicy: {{ .Values.images.extender.pullPolicy }}
        env:
        - name: LOG_LEVEL
          value: debug
        - name: PORT
          value: "12345"

@@ -0,0 +1,67 @@
# rbac.yaml
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-schd-extender
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - events
  verbs:
  - create
  - patch
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - update
  - patch
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - bindings
  - pods/binding
  verbs:
  - create
- apiGroups:
  - ""
  resources:
  - configmaps
  verbs:
  - get
  - list
  - watch
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: gpushare-schd-extender
subjects:
- kind: ServiceAccount
  name: gpushare-schd-extender
  namespace: kube-system

@@ -0,0 +1,19 @@
apiVersion: v1
kind: Service
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
  labels:
    app: gpushare
    component: gpushare-schd-extender
spec:
  # type: ClusterIP
  type: NodePort
  ports:
  - port: 12345
    name: http
    targetPort: 12345
    nodePort: 32766
  selector:
    app: gpushare
    component: gpushare-schd-extender

@@ -0,0 +1,66 @@
apiVersion: batch/v1
kind: Job
metadata:
  name: gpushare-installer
  namespace: kube-system
  labels:
    app: gpushare
    name: gpushare-installer
    chart: {{ template "gpushare-installer.chart" . }}
    release: {{ .Release.Name }}
    heritage: {{ .Release.Service }}
spec:
  parallelism: {{ .Values.masterCount }}
  template:
    metadata:
      labels:
        chart: {{ template "gpushare-installer.chart" . }}
        release: {{ .Release.Name }}
        heritage: {{ .Release.Service }}
        app: gpushare
        name: gpushare-installer
    spec:
      hostNetwork: true
      tolerations:
      - effect: NoSchedule
        operator: Exists
        key: node-role.kubernetes.io/master
      - effect: NoSchedule
        operator: Exists
        key: node.cloudprovider.kubernetes.io/uninitialized
      nodeSelector:
        node-role.kubernetes.io/master: ""
      restartPolicy: OnFailure
      containers:
      - name: deploy-schd
        image: "{{ .Values.images.installer.image }}:{{ .Values.images.installer.tag }}"
        imagePullPolicy: {{ .Values.images.installer.pullPolicy }}
        securityContext:
          privileged: true
        command:
        - bash
        - /schd-extender/install-sched-extender-on-host.sh
        env:
        - name: NODE_IP
          valueFrom:
            fieldRef:
              fieldPath: status.hostIP
        volumeMounts:
        - name: kube-dir
          mountPath: /etc/kubernetes
      volumes:
      - hostPath:
          path: /etc/kubernetes
          type: Directory
        name: kube-dir
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
              - key: name
                operator: In
                values:
                - gpushare-installer
            topologyKey: "kubernetes.io/hostname"
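
The Job only runs install-sched-extender-on-host.sh on each master (the script itself is not part of this commit), presumably to register gpushare-schd-extender with kube-scheduler. A sketch of what such an extender registration could look like, assuming the endpoint layout of the upstream gpushare-scheduler-extender project and the Service's NodePort 32766; the real file written by the script may differ:

# KubeSchedulerConfiguration sketch only; the /gpushare-scheduler prefix and the
# aliyun.com/gpu-mem resource name are assumptions taken from the upstream project.
apiVersion: kubescheduler.config.k8s.io/v1
kind: KubeSchedulerConfiguration
clientConnection:
  kubeconfig: /etc/kubernetes/scheduler.conf
extenders:
- urlPrefix: "http://127.0.0.1:32766/gpushare-scheduler"
  filterVerb: filter
  bindVerb: bind
  enableHTTPS: false
  nodeCacheCapable: true
  managedResources:
  - name: aliyun.com/gpu-mem
    ignoredByScheduler: false
  ignorable: false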

@@ -0,0 +1,29 @@
# Default values for gpushare-installer.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
masterCount: 3
images:
  extender:
    image: "registry.cn-beijing.aliyuncs.com/acs/k8s-gpushare-schd-extender"
    tag: v1.0.0-ce6f800-aliyun
    pullPolicy: IfNotPresent
  installer:
    image: "registry.cn-beijing.aliyuncs.com/acs/schd-extender-deployer"
    tag: v1.0.0-b56d26d-aliyun
    pullPolicy: IfNotPresent
  devicePlugin:
    image: "registry.cn-beijing.aliyuncs.com/acs/k8s-gpushare-plugin"
    tag: v1.0.0-2656995-aliyun
    pullPolicy: IfNotPresent
  evictor:
    image: "registry.cn-beijing.aliyuncs.com/acs/nvidia-device-plugin-evict"
    tag: v1.0.0-b56d26d-aliyun
    pullPolicy: IfNotPresent
  recover:
    image: "registry.cn-beijing.aliyuncs.com/acs/nvidia-device-plugin-recover"
    tag: v1.0.0-b56d26d-aliyun
    pullPolicy: IfNotPresent
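
masterCount drives the installer Job's parallelism, and the Job's pod anti-affinity spreads one installer pod per master, so it should match the number of master nodes in the cluster. A minimal override sketch (file name and values are illustrative, keys follow values.yaml above):

# custom-values.yaml — illustrative override, not part of this commit
masterCount: 1
images:
  extender:
    pullPolicy: Always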