synchronization

commit 33f9b3ce46
Date: 2025-08-25 16:04:00 +08:00
1951 changed files with 854396 additions and 0 deletions

@@ -0,0 +1,21 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*~
# Various IDEs
.project
.idea/
*.tmproj

@@ -0,0 +1,5 @@
apiVersion: v1
appVersion: "1.0"
description: A Helm chart for Kubernetes
name: gpushare-installer
version: 0.7.0

@@ -0,0 +1,27 @@
### 0.1.0
* Support gpushare deployment
### 0.2.0
* Fix GPU exclusive scheduling not being recovered after removing gpushare
### 0.3.0
* Support Helm v3
### 0.4.0
* Delete the kubeVersion env
### 0.5.0
* Change the host mount directory to /etc/kubernetes
### 0.6.0
* Change the StatefulSet to a Job
### 0.7.0
* Support the unhealthy ConfigMap

@@ -0,0 +1,32 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
*/}}
{{- define "gpushare-installer.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "gpushare-installer.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "gpushare-installer.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end -}}
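
Of the three helpers defined above, only gpushare-installer.chart is referenced by the templates in this commit (in the installer Job's labels). As a hedged illustration only, a hypothetical template in this chart would typically consume the name helpers like this:

# Illustrative only: typical use of the helpers above; this snippet is not part of the commit.
metadata:
  name: {{ template "gpushare-installer.fullname" . }}
  labels:
    app: {{ template "gpushare-installer.name" . }}
    chart: {{ template "gpushare-installer.chart" . }}
    release: {{ .Release.Name }}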

@@ -0,0 +1,43 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: device-plugin-evict-ds
  namespace: kube-system
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      component: gpushare-device-plugin
      app: gpushare
      name: device-plugin-evict-ds
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        component: gpushare-device-plugin
        app: gpushare
        name: device-plugin-evict-ds
    spec:
      hostNetwork: true
      nodeSelector:
        gpushare: "true"
      containers:
      - image: "{{ .Values.images.evictor.image }}:{{ .Values.images.evictor.tag }}"
        imagePullPolicy: {{ .Values.images.evictor.pullPolicy }}
        command:
        - bash
        - /dp-evict/dp-evict-on-host.sh
        name: gpushare
        # Make this pod a Guaranteed pod so that it is never evicted due to the node's resource consumption.
        securityContext:
          privileged: true
        volumeMounts:
        - name: kube-dir
          mountPath: /etc/kubernetes
      volumes:
      - hostPath:
          path: /etc/kubernetes
          type: Directory
        name: kube-dir

@@ -0,0 +1,52 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: device-plugin-recover-ds
  namespace: kube-system
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      component: gpushare-device-plugin
      app: gpushare
      name: device-plugin-recover-ds
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        component: gpushare-device-plugin
        app: gpushare
        name: device-plugin-recover-ds
    spec:
      nodeSelector:
        gpushare: "false"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: aliyun.accelerator/nvidia_count
                operator: Exists
      # nodeSelector:
      #   gpu-instance: "true"
      hostNetwork: true
      containers:
      - image: "{{ .Values.images.recover.image }}:{{ .Values.images.recover.tag }}"
        imagePullPolicy: {{ .Values.images.recover.pullPolicy }}
        command:
        - bash
        - /dp-evict/dp-recover-on-host.sh
        name: gpushare
        # Make this pod a Guaranteed pod so that it is never evicted due to the node's resource consumption.
        securityContext:
          privileged: true
        volumeMounts:
        - name: kube-dir
          mountPath: /etc/kubernetes
      volumes:
      - hostPath:
          path: /etc/kubernetes
          type: Directory
        name: kube-dir
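
Taken together, the two DaemonSets above key off node labels: the evict DaemonSet runs on nodes labeled gpushare=true, while the recover DaemonSet runs on nodes labeled gpushare=false that still carry the aliyun.accelerator/nvidia_count label. A minimal sketch of the labels involved (node name and the nvidia_count value are illustrative; only the label's existence is checked):

# Illustrative node labels only; not part of this commit.
apiVersion: v1
kind: Node
metadata:
  name: gpu-node-1                        # hypothetical node name
  labels:
    gpushare: "true"                      # flip to "false" to have the recover DaemonSet run instead
    aliyun.accelerator/nvidia_count: "1"  # matched by the recover DaemonSet's Exists expression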

@@ -0,0 +1,61 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gpushare-device-plugin-ds
  namespace: kube-system
spec:
  selector:
    matchLabels:
      component: gpushare-device-plugin
      app: gpushare
      name: gpushare-device-plugin-ds
      type: runtime
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        component: gpushare-device-plugin
        app: gpushare
        name: gpushare-device-plugin-ds
        type: runtime
    spec:
      serviceAccount: gpushare-device-plugin
      hostNetwork: true
      nodeSelector:
        gpushare: "true"
      containers:
      - image: "{{ .Values.images.devicePlugin.image }}:{{ .Values.images.devicePlugin.tag }}"
        imagePullPolicy: {{ .Values.images.devicePlugin.pullPolicy }}
        name: gpushare
        # Make this pod a Guaranteed pod so that it is never evicted due to the node's resource consumption.
        command:
        - gpushare-device-plugin-v2
        - -logtostderr
        - --v=5
        - --memory-unit=GiB
        resources:
          limits:
            memory: "300Mi"
            cpu: "1"
          requests:
            memory: "300Mi"
            cpu: "1"
        env:
        - name: KUBECONFIG
          value: /etc/kubernetes/kubelet.conf
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        volumeMounts:
        - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
      volumes:
      - name: device-plugin
        hostPath:
          path: /var/lib/kubelet/device-plugins
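
Because the device plugin is started with --memory-unit=GiB, it advertises GPU memory in GiB increments as an extended resource. A hedged example of a workload consuming that resource, assuming the aliyun.com/gpu-mem resource name used by the upstream gpushare project (the name is not defined anywhere in this commit):

# Illustrative consumer pod only; not part of this commit.
apiVersion: v1
kind: Pod
metadata:
  name: gpu-mem-demo
spec:
  containers:
  - name: cuda
    image: nvidia/cuda:11.0-base          # any CUDA-capable image
    command: ["sleep", "infinity"]
    resources:
      limits:
        aliyun.com/gpu-mem: 3             # 3 GiB of shared GPU memory (GiB per --memory-unit=GiB)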

@@ -0,0 +1,59 @@
# rbac.yaml
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-device-plugin
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - events
  verbs:
  - create
  - patch
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - update
  - patch
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - nodes/status
  verbs:
  - patch
  - update
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpushare-device-plugin
  namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-device-plugin
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: gpushare-device-plugin
subjects:
- kind: ServiceAccount
  name: gpushare-device-plugin
  namespace: kube-system

@@ -0,0 +1,45 @@
# deployment yaml
---
kind: Deployment
apiVersion: apps/v1
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: gpushare
      component: gpushare-schd-extender
      type: runtime
  replicas: 1
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: gpushare
        component: gpushare-schd-extender
        type: runtime
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ''
    spec:
      hostNetwork: true
      tolerations:
      - effect: NoSchedule
        operator: Exists
        key: node-role.kubernetes.io/master
      - effect: NoSchedule
        operator: Exists
        key: node.cloudprovider.kubernetes.io/uninitialized
      nodeSelector:
        node-role.kubernetes.io/master: ""
      serviceAccount: gpushare-schd-extender
      containers:
      - name: gpushare-schd-extender
        image: "{{ .Values.images.extender.image }}:{{ .Values.images.extender.tag }}"
        imagePullPolicy: {{ .Values.images.extender.pullPolicy }}
        env:
        - name: LOG_LEVEL
          value: debug
        - name: PORT
          value: "12345"

@@ -0,0 +1,67 @@
# rbac.yaml
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-schd-extender
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - events
  verbs:
  - create
  - patch
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - update
  - patch
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - bindings
  - pods/binding
  verbs:
  - create
- apiGroups:
  - ""
  resources:
  - configmaps
  verbs:
  - get
  - list
  - watch
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: gpushare-schd-extender
subjects:
- kind: ServiceAccount
  name: gpushare-schd-extender
  namespace: kube-system

@@ -0,0 +1,19 @@
apiVersion: v1
kind: Service
metadata:
  name: gpushare-schd-extender
  namespace: kube-system
  labels:
    app: gpushare
    component: gpushare-schd-extender
spec:
  # type: ClusterIP
  type: NodePort
  ports:
  - port: 12345
    name: http
    targetPort: 12345
    nodePort: 32766
  selector:
    app: gpushare
    component: gpushare-schd-extender

@@ -0,0 +1,66 @@
apiVersion: batch/v1
kind: Job
metadata:
  name: gpushare-installer
  namespace: kube-system
  labels:
    app: gpushare
    name: gpushare-installer
    chart: {{ template "gpushare-installer.chart" . }}
    release: {{ .Release.Name }}
    heritage: {{ .Release.Service }}
spec:
  parallelism: {{ .Values.masterCount }}
  template:
    metadata:
      labels:
        chart: {{ template "gpushare-installer.chart" . }}
        release: {{ .Release.Name }}
        heritage: {{ .Release.Service }}
        app: gpushare
        name: gpushare-installer
    spec:
      hostNetwork: true
      tolerations:
      - effect: NoSchedule
        operator: Exists
        key: node-role.kubernetes.io/master
      - effect: NoSchedule
        operator: Exists
        key: node.cloudprovider.kubernetes.io/uninitialized
      nodeSelector:
        node-role.kubernetes.io/master: ""
      restartPolicy: OnFailure
      containers:
      - name: deploy-schd
        image: "{{ .Values.images.installer.image }}:{{ .Values.images.installer.tag }}"
        imagePullPolicy: {{ .Values.images.installer.pullPolicy }}
        securityContext:
          privileged: true
        command:
        - bash
        - /schd-extender/install-sched-extender-on-host.sh
        env:
        - name: NODE_IP
          valueFrom:
            fieldRef:
              fieldPath: status.hostIP
        volumeMounts:
        - name: kube-dir
          mountPath: /etc/kubernetes
      volumes:
      - hostPath:
          path: /etc/kubernetes
          type: Directory
        name: kube-dir
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
              - key: name
                operator: In
                values:
                - gpushare-installer
            topologyKey: "kubernetes.io/hostname"
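
The Job only runs install-sched-extender-on-host.sh on each master (the script itself is not part of this commit), presumably to register gpushare-schd-extender with kube-scheduler. A sketch of what such an extender registration could look like, assuming the endpoint layout of the upstream gpushare-scheduler-extender project and the Service's NodePort 32766; the real file written by the script may differ:

# KubeSchedulerConfiguration sketch only; the /gpushare-scheduler prefix and the
# aliyun.com/gpu-mem resource name are assumptions taken from the upstream project.
apiVersion: kubescheduler.config.k8s.io/v1
kind: KubeSchedulerConfiguration
clientConnection:
  kubeconfig: /etc/kubernetes/scheduler.conf
extenders:
- urlPrefix: "http://127.0.0.1:32766/gpushare-scheduler"
  filterVerb: filter
  bindVerb: bind
  enableHTTPS: false
  nodeCacheCapable: true
  managedResources:
  - name: aliyun.com/gpu-mem
    ignoredByScheduler: false
  ignorable: false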

@@ -0,0 +1,29 @@
# Default values for gpushare-installer.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
masterCount: 3
images:
  extender:
    image: "registry.cn-beijing.aliyuncs.com/acs/k8s-gpushare-schd-extender"
    tag: v1.0.0-ce6f800-aliyun
    pullPolicy: IfNotPresent
  installer:
    image: "registry.cn-beijing.aliyuncs.com/acs/schd-extender-deployer"
    tag: v1.0.0-b56d26d-aliyun
    pullPolicy: IfNotPresent
  devicePlugin:
    image: "registry.cn-beijing.aliyuncs.com/acs/k8s-gpushare-plugin"
    tag: v1.0.0-2656995-aliyun
    pullPolicy: IfNotPresent
  evictor:
    image: "registry.cn-beijing.aliyuncs.com/acs/nvidia-device-plugin-evict"
    tag: v1.0.0-b56d26d-aliyun
    pullPolicy: IfNotPresent
  recover:
    image: "registry.cn-beijing.aliyuncs.com/acs/nvidia-device-plugin-recover"
    tag: v1.0.0-b56d26d-aliyun
    pullPolicy: IfNotPresent
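
masterCount drives the installer Job's parallelism, and the Job's pod anti-affinity spreads one installer pod per master, so it should match the number of master nodes in the cluster. A minimal override sketch (file name and values are illustrative, keys follow values.yaml above):

# custom-values.yaml — illustrative override, not part of this commit
masterCount: 1
images:
  extender:
    pullPolicy: Always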