Spaces:

abugaber
/

aiben

Build error

File size: 32,286 Bytes
{{- /*
  Values sanity checks, evaluated before any manifest is rendered.
  At most one of TGI / vLLM / lmdeploy may be enabled, and the single-pod
  h2oGPT stack requires both vLLM and h2oGPT to be enabled. The first
  violated rule aborts rendering with its message.
*/}}
{{- $vllmOn := .Values.vllm.enabled }}
{{- $tgiOn := .Values.tgi.enabled }}
{{- if and $vllmOn $tgiOn }}
  {{- fail "Both TGI and vLLM cannot be enabled at the same time. Enable only one and try again" }}
{{- end }}
{{- $lmdeployOn := .Values.lmdeploy.enabled }}
{{- if and $lmdeployOn $vllmOn }}
  {{- fail "Both lmdeploy and vLLM cannot be enabled at the same time. Enable only one and try again" }}
{{- end }}
{{- if and $tgiOn $lmdeployOn }}
  {{- fail "Both TGI and lmdeploy cannot be enabled at the same time. Enable only one and try again" }}
{{- end }}
{{- /* Stack mode is invalid as soon as either vLLM or h2oGPT is switched off. */}}
{{- if and .Values.h2ogpt.stack.enabled (or (not $vllmOn) (not .Values.h2ogpt.enabled)) }}
  {{- fail "If h2oGPT stack is enabled, both vLLM and h2oGPT should be enabled" }}
{{- end }}
---
{{- /*
  h2oGPT application Deployment, rendered only when .Values.h2ogpt.enabled.

  With .Values.h2ogpt.stack.enabled a vLLM inference container is added to
  the same pod and h2oGPT is pointed at localhost:5000. Without it, h2oGPT is
  pointed at the in-chart TGI / vLLM / lmdeploy inference endpoint unless
  .Values.h2ogpt.externalLLM.enabled is set.
*/}}
{{- if .Values.h2ogpt.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "h2ogpt.fullname" . }}
  namespace: {{ include "h2ogpt.namespace" . | quote }}
  labels:
    app: {{ include "h2ogpt.fullname" . }}
spec:
  {{- /* replicas is only set when autoscaling is disabled. */}}
  {{- if not .Values.h2ogpt.autoscaling.enabled }}
  replicas: {{ .Values.h2ogpt.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ include "h2ogpt.fullname" . }}
  {{- if .Values.h2ogpt.updateStrategy }}
  strategy: {{- toYaml .Values.h2ogpt.updateStrategy | nindent 4 }}
  {{- end }}
  template:
    metadata:
      {{- with .Values.h2ogpt.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        app: {{ include "h2ogpt.fullname" . }}
        {{- with .Values.h2ogpt.podLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    spec:
      {{- with .Values.h2ogpt.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.h2ogpt.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.h2ogpt.podSecurityContext | nindent 8 }}
      affinity:
        {{- /* Anti-affinity against pods carrying this Deployment's own app label. */}}
        {{- if .Values.h2ogpt.podAffinity }}
        podAntiAffinity:
          {{- if .Values.h2ogpt.podAffinity.hostname }}
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: app
                    operator: In
                    values:
                      - {{ include "h2ogpt.fullname" . }}
              topologyKey: kubernetes.io/hostname
          {{- end }}
          {{- if .Values.h2ogpt.podAffinity.zone }}
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}
                topologyKey: failure-domain.beta.kubernetes.io/zone
          {{- end }}
        {{- end }}
      {{- /* extraAffinity is merged verbatim under the affinity key above. */}}
      {{- with .Values.h2ogpt.extraAffinity }}
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.h2ogpt.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        {{- /* Stack mode only: vLLM OpenAI-compatible server running inside this pod. */}}
        {{- if .Values.h2ogpt.stack.enabled }}
        - name: {{ include "h2ogpt.fullname" . }}-vllm-inference
          securityContext:
            {{- toYaml .Values.vllm.securityContext | nindent 12 }}
          image: "{{ .Values.vllm.image.repository }}:{{ .Values.vllm.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.vllm.image.pullPolicy }}
          command: ["python3"]
          args:
            - "-m"
            - "vllm.entrypoints.openai.api_server"
            - "--port"
            - "5000"
            - "--host"
            - "0.0.0.0"
            - "--download-dir"
            - "/workspace/.cache/huggingface/hub"
            {{- /* User-supplied arguments are appended after the fixed ones. */}}
            {{- range $arg := .Values.vllm.containerArgs }}
            - "{{ $arg }}"
            {{- end }}
          ports:
            - name: http
              containerPort: 5000
              protocol: TCP
          {{- /* Probe settings from values are merged next to the fixed httpGet target. */}}
          {{- if .Values.vllm.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.vllm.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.vllm.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.vllm.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.vllm.resources | nindent 12 }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-vllm-inference-config
          env:
          - name: NCCL_IGNORE_DISABLED_P2P
            value: "1"
          {{- range $key, $value := .Values.vllm.env }}
          - name: "{{ $key }}"
            value: "{{ $value }}"
          {{- end }}
          volumeMounts:
            - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
              mountPath: /workspace/.cache
              subPath: cache
            - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
              mountPath: /dev/shm
              subPath: shm
        {{- end }}
        - name: {{ include "h2ogpt.fullname" . }}
          securityContext:
            {{- toYaml .Values.h2ogpt.securityContext | nindent 12 }}
          image: "{{ .Values.h2ogpt.image.repository }}:{{ .Values.h2ogpt.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.h2ogpt.image.pullPolicy }}
          command: ["/bin/bash", "-c"]
          {{- /*
            Exactly one args script is always rendered, because the command
            above is "bash -c" and needs a script to run.
              1. stack mode: wait for the in-pod vLLM server, then start;
              2. in-chart vLLM / TGI / lmdeploy without a modelLock: wait for
                 that inference endpoint, then start;
              3. anything else (no backend, or a modelLock is configured):
                 start immediately.
          */}}
          {{- if .Values.h2ogpt.stack.enabled }}
          args:
          - >
            while [[ "$(curl --insecure -s -o /dev/null -w ''%{http_code}''
            http://localhost:5000/v1/models)" != "200" ]]; do
              echo "Waiting for inference service to become ready... (2sec)"
              sleep 2
            done

            python3 /workspace/generate.py
          {{- else if and .Values.vllm.enabled (not .Values.h2ogpt.externalLLM.modelLock) }}
          args:
          - >
            until wget -O- http://{{ include "h2ogpt.fullname" . }}-vllm-inference:{{ .Values.vllm.service.port }}/v1/models >/dev/null 2>&1;
              do
                echo "Waiting for inference service to become ready...";
                sleep 5;
              done

            python3 /workspace/generate.py
          {{- else if and .Values.tgi.enabled (not .Values.h2ogpt.externalLLM.modelLock) }}
          args:
          - >
            until wget -O- http://{{ include "h2ogpt.fullname" . }}-tgi-inference:{{ .Values.tgi.service.port }}/ >/dev/null 2>&1;
              do
                echo "Waiting for inference service to become ready...";
                sleep 5;
              done

            python3 /workspace/generate.py
          {{- else if and .Values.lmdeploy.enabled (not .Values.h2ogpt.externalLLM.modelLock) }}
          args:
          - >
            until wget -O- http://{{ include "h2ogpt.fullname" . }}-lmdeploy-inference:{{ .Values.lmdeploy.service.port }}/ >/dev/null 2>&1;
              do
                echo "Waiting for inference service to become ready...";
                sleep 5;
              done

            python3 /workspace/generate.py
          {{- else }}
          args:
          - >
            python3 /workspace/generate.py
          {{- end }}
          {{- /*
            NOTE(review): in stack mode the vLLM container of this pod is
            started with --port 5000, the same number declared for the
            "openai" port below. Confirm the two listeners cannot collide.
          */}}
          ports:
            - name: http
              containerPort: 7860
              protocol: TCP
            - name: gpt
              containerPort: 8888
              protocol: TCP
            - name: openai
              containerPort: 5000
              protocol: TCP
            - name: function
              containerPort: 5002
              protocol: TCP
            - name: agent
              containerPort: 5004
              protocol: TCP
          {{- if .Values.h2ogpt.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.h2ogpt.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.h2ogpt.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.h2ogpt.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.h2ogpt.resources | nindent 12 }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-config
          env:
          {{- /* h2ogpt_inference_server: set for in-chart backends only, never when externalLLM is enabled. */}}
          {{- if and .Values.tgi.enabled (not .Values.h2ogpt.externalLLM.enabled) (not .Values.h2ogpt.stack.enabled) }}
          - name: h2ogpt_inference_server
            value: "http://{{ include "h2ogpt.fullname" . }}-tgi-inference:{{ .Values.tgi.service.port }}"
          {{- end }}
          {{- if and .Values.vllm.enabled (not .Values.h2ogpt.externalLLM.enabled) (not .Values.h2ogpt.stack.enabled) }}
          - name: h2ogpt_inference_server
            value: "vllm:{{ include "h2ogpt.fullname" . }}-vllm-inference:{{ .Values.vllm.service.port }}"
          {{- end }}
          {{- if and .Values.lmdeploy.enabled (not .Values.h2ogpt.externalLLM.enabled) (not .Values.h2ogpt.stack.enabled) }}
          - name: h2ogpt_inference_server
            value: "http://{{ include "h2ogpt.fullname" . }}-lmdeploy-inference:{{ .Values.lmdeploy.service.port }}"
          {{- end }}
          {{- if and .Values.h2ogpt.stack.enabled (not .Values.h2ogpt.externalLLM.enabled) }}
          - name: h2ogpt_inference_server
            value: "vllm:localhost:5000"
          {{- end }}
          {{- range $key, $value := .Values.h2ogpt.env }}
          - name: "{{ $key }}"
            value: "{{ $value }}"
          {{- end }}
          {{- /* External provider credentials are read from the secret named by externalLLM.secret. */}}
          {{- if and .Values.h2ogpt.externalLLM.openAIAzure.enabled .Values.h2ogpt.externalLLM.enabled }}
          - name: OPENAI_AZURE_KEY
            valueFrom:
              secretKeyRef:
                name: {{ .Values.h2ogpt.externalLLM.secret }}
                key: OPENAI_AZURE_KEY
          - name: OPENAI_AZURE_API_BASE
            valueFrom:
              secretKeyRef:
                name: {{ .Values.h2ogpt.externalLLM.secret }}
                key: OPENAI_AZURE_API_BASE
          {{- end }}
          {{- if and .Values.h2ogpt.externalLLM.openAI.enabled .Values.h2ogpt.externalLLM.enabled }}
          - name: OPENAI_API_KEY
            valueFrom:
              secretKeyRef:
                name: {{ .Values.h2ogpt.externalLLM.secret }}
                key: OPENAI_API_KEY
          {{- end }}
          {{- if and .Values.h2ogpt.externalLLM.replicate.enabled .Values.h2ogpt.externalLLM.enabled }}
          - name: REPLICATE_API_TOKEN
            valueFrom:
              secretKeyRef:
                name: {{ .Values.h2ogpt.externalLLM.secret }}
                key: REPLICATE_API_TOKEN
          {{- end }}
          {{- if .Values.h2ogpt.externalLLM.enabled }}
          - name: H2OGPT_MODEL_LOCK
            value: {{ toJson .Values.h2ogpt.externalLLM.modelLock | quote }}
          - name: H2OGPT_SCORE_MODEL
            value: None
          {{- end }}
          {{- if .Values.h2ogpt.visionModels.enabled }}
          - name: H2OGPT_VISIBLE_VISION_MODELS
            value: {{ .Values.h2ogpt.visionModels.visibleModels | quote }}
          - name: H2OGPT_ROTATE_ALIGN_RESIZE_IMAGE
            value: {{ .Values.h2ogpt.visionModels.rotateAlignResizeImage | quote }}
          {{- end }}
          volumeMounts:
            - name: {{ include "h2ogpt.fullname" . }}-volume
              mountPath: /workspace/.cache
              subPath: cache
            - name: {{ include "h2ogpt.fullname" . }}-volume
              mountPath: /workspace/save
              subPath: save
            {{- if .Values.caCertificates }}
            - name: ca-certificates
              mountPath: /etc/ssl/certs/root-ca-bundle.crt
              subPath: root-ca-bundle.crt
            {{- end }}
            {{- with .Values.h2ogpt.extraVolumeMounts }}
            {{- toYaml . | nindent 12 }}
            {{- end }}
      volumes:
        {{- /* Each data volume is either a pre-created PVC or an ephemeral volumeClaimTemplate. */}}
        - name: {{ include "h2ogpt.fullname" . }}-volume
          {{- if not .Values.h2ogpt.storage.useEphemeral }}
          persistentVolumeClaim:
            claimName: {{ include "h2ogpt.fullname" . }}-volume
          {{- else }}
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.h2ogpt.storage.size | quote }}
                storageClassName: {{ .Values.h2ogpt.storage.class }}
          {{- end }}
        {{- if .Values.h2ogpt.stack.enabled }}
        - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
          {{- if not .Values.vllm.storage.useEphemeral }}
          persistentVolumeClaim:
            claimName: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
          {{- else }}
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.vllm.storage.size | quote }}
                storageClassName: {{ .Values.vllm.storage.class }}
          {{- end }}
        {{- end }}
        {{- if .Values.caCertificates }}
        - name: ca-certificates
          configMap:
            name: {{ include "h2ogpt.fullname" . }}-ca-certificates
        {{- end }}
        {{- with .Values.h2ogpt.extraVolumes }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
{{- end }}
---
{{- /*
  PersistentVolumeClaim named "<fullname>-volume" for the h2oGPT Deployment.
  Not rendered when h2oGPT is disabled or when
  .Values.h2ogpt.storage.useEphemeral is set.
*/}}
{{- if and .Values.h2ogpt.enabled (not .Values.h2ogpt.storage.useEphemeral) }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: {{ include "h2ogpt.namespace" . | quote }}
  name: {{ include "h2ogpt.fullname" . }}-volume
spec:
  {{- /*
    The class is rendered unquoted. Presumably so that an unset value
    becomes null instead of an empty string -- TODO confirm.
  */}}
  storageClassName: {{ .Values.h2ogpt.storage.class }}
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: {{ .Values.h2ogpt.storage.size | quote }}
{{- end }}

---
{{- /*
  Standalone TGI inference Deployment.
  Rendered when .Values.tgi.enabled is set and the single-pod h2oGPT stack
  (.Values.h2ogpt.stack.enabled) is not in use.
*/}}
{{- if and .Values.tgi.enabled (not .Values.h2ogpt.stack.enabled) }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "h2ogpt.fullname" . }}-tgi-inference
  namespace: {{ include "h2ogpt.namespace" . | quote }}
  labels:
    app: {{ include "h2ogpt.fullname" . }}-tgi-inference
spec:
  {{- /* replicas is only set when autoscaling is disabled. */}}
  {{- if not .Values.tgi.autoscaling.enabled }}
  replicas: {{ .Values.tgi.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ include "h2ogpt.fullname" . }}-tgi-inference
  {{- if .Values.tgi.updateStrategy }}
  strategy: {{- toYaml .Values.tgi.updateStrategy | nindent 4 }}
  {{- end }}
  template:
    metadata:
      {{- with .Values.tgi.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        app: {{ include "h2ogpt.fullname" . }}-tgi-inference
        {{- /* Extra labels must sit at the same depth (8) as the app label above. */}}
        {{- with .Values.tgi.podLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    spec:
      {{- with .Values.tgi.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tgi.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.tgi.podSecurityContext | nindent 8 }}
      affinity:
        {{- /*
          NOTE(review): the selectors below match app=<fullname>, which is the
          label carried by the h2oGPT pods, not this Deployment's own
          "-tgi-inference" label. Confirm that is the intended target.
        */}}
        {{- if .Values.tgi.podAffinity }}
        podAntiAffinity:
          {{- if .Values.tgi.podAffinity.hostname }}
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: app
                    operator: In
                    values:
                      - {{ include "h2ogpt.fullname" . }}
              topologyKey: kubernetes.io/hostname
          {{- end }}
          {{- if .Values.tgi.podAffinity.zone }}
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}
                topologyKey: failure-domain.beta.kubernetes.io/zone
          {{- end }}
        {{- end }}
      {{- /* extraAffinity is merged verbatim under the affinity key above. */}}
      {{- with .Values.tgi.extraAffinity }}
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tgi.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        - name: {{ include "h2ogpt.fullname" . }}-tgi-inference
          securityContext:
            {{- toYaml .Values.tgi.securityContext | nindent 12 }}
          image: "{{ .Values.tgi.image.repository }}:{{ .Values.tgi.image.tag }}"
          imagePullPolicy: {{ .Values.tgi.image.pullPolicy }}
          command: []
          {{- /* All arguments come from values; nothing is hard-coded here. */}}
          args:
            {{- range $arg := .Values.tgi.containerArgs }}
            - "{{ $arg }}"
            {{- end }}
          ports:
            - name: http
              containerPort: 80
              protocol: TCP
          {{- /* Probe settings from values are merged next to the fixed httpGet target. */}}
          {{- if .Values.tgi.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.tgi.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.tgi.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.tgi.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.tgi.resources | nindent 12 }}
          env:
          {{- range $key, $value := .Values.tgi.env }}
          - name: "{{ $key }}"
            value: "{{ $value }}"
          {{- end }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-tgi-inference-config
            {{- /* Only reference the secret when one is configured; a null name would be rejected. */}}
            {{- with .Values.tgi.hfSecret }}
            - secretRef:
                name: {{ . }}
            {{- end }}
          volumeMounts:
            - name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
              mountPath: /app/cache
              subPath: cache
            - name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
              mountPath: /data
              subPath: data
            - name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
              mountPath: /dev/shm
              subPath: shm
      volumes:
        {{- /* Either a pre-created PVC or an ephemeral volumeClaimTemplate. */}}
        - name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
          {{- if not .Values.tgi.storage.useEphemeral }}
          persistentVolumeClaim:
            claimName: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
          {{- else }}
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.tgi.storage.size | quote }}
                storageClassName: {{ .Values.tgi.storage.class }}
          {{- end }}
{{- end }}
---
{{- /*
  PersistentVolumeClaim named "<fullname>-tgi-inference-volume" for the TGI
  inference Deployment. Not rendered when TGI is disabled or when
  .Values.tgi.storage.useEphemeral is set.
*/}}
{{- if and .Values.tgi.enabled (not .Values.tgi.storage.useEphemeral) }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
  namespace: {{ include "h2ogpt.namespace" . | quote }}
spec:
  accessModes:
    - ReadWriteOnce
  {{- /*
    The class is rendered unquoted. Presumably so that an unset value
    becomes null instead of an empty string -- TODO confirm.
    This is a template comment rather than a "#" YAML comment on purpose:
    template actions inside YAML comments are still evaluated at render time.
  */}}
  storageClassName: {{ .Values.tgi.storage.class }}
  resources:
    requests:
      storage: {{ .Values.tgi.storage.size | quote }}
{{- end }}
---
{{- /*
  Standalone vLLM inference Deployment.
  Rendered when .Values.vllm.enabled is set and the single-pod h2oGPT stack
  (.Values.h2ogpt.stack.enabled) is not in use.
*/}}
{{- if and .Values.vllm.enabled (not .Values.h2ogpt.stack.enabled) }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "h2ogpt.fullname" . }}-vllm-inference
  namespace: {{ include "h2ogpt.namespace" . | quote }}
  labels:
    app: {{ include "h2ogpt.fullname" . }}-vllm-inference
spec:
  {{- /* replicas is only set when autoscaling is disabled. */}}
  {{- if not .Values.vllm.autoscaling.enabled }}
  replicas: {{ .Values.vllm.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ include "h2ogpt.fullname" . }}-vllm-inference
  {{- with .Values.vllm.updateStrategy }}
  strategy:
    {{- toYaml . | nindent 4 }}
  {{- end }}
  template:
    metadata:
      {{- with .Values.vllm.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        app: {{ include "h2ogpt.fullname" . }}-vllm-inference
        {{- with .Values.vllm.podLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    spec:
      {{- with .Values.vllm.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.vllm.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.vllm.podSecurityContext | nindent 8 }}
      affinity:
        {{- /*
          NOTE(review): the selectors below match app=<fullname>, not this
          Deployment's own "-vllm-inference" label. Confirm that is intended.
        */}}
        {{- if .Values.vllm.podAffinity }}
        podAntiAffinity:
          {{- if .Values.vllm.podAffinity.hostname }}
          requiredDuringSchedulingIgnoredDuringExecution:
            - topologyKey: kubernetes.io/hostname
              labelSelector:
                matchExpressions:
                  - key: app
                    operator: In
                    values:
                      - {{ include "h2ogpt.fullname" . }}
          {{- end }}
          {{- if .Values.vllm.podAffinity.zone }}
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                topologyKey: failure-domain.beta.kubernetes.io/zone
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}
          {{- end }}
        {{- end }}
      {{- /* extraAffinity is merged verbatim under the affinity key above. */}}
      {{- with .Values.vllm.extraAffinity }}
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.vllm.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        - name: {{ include "h2ogpt.fullname" . }}-vllm-inference
          image: "{{ .Values.vllm.image.repository }}:{{ .Values.vllm.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.vllm.image.pullPolicy }}
          securityContext:
            {{- toYaml .Values.vllm.securityContext | nindent 12 }}
          command:
            - "python3"
          {{- /* Fixed server arguments first, then anything listed in vllm.containerArgs. */}}
          args:
            - "-m"
            - "vllm.entrypoints.openai.api_server"
            - "--port"
            - "5000"
            - "--host"
            - "0.0.0.0"
            - "--download-dir"
            - "/workspace/.cache/huggingface/hub"
            {{- range .Values.vllm.containerArgs }}
            - "{{ . }}"
            {{- end }}
          ports:
            - name: http
              protocol: TCP
              containerPort: 5000
          {{- /* Probe settings from values are merged next to the fixed httpGet target. */}}
          {{- with .Values.vllm.livenessProbe }}
          livenessProbe:
            httpGet:
              port: http
              scheme: HTTP
              path: /
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .Values.vllm.readinessProbe }}
          readinessProbe:
            httpGet:
              port: http
              scheme: HTTP
              path: /
            {{- toYaml . | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.vllm.resources | nindent 12 }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-vllm-inference-config
          env:
            - name: NCCL_IGNORE_DISABLED_P2P
              value: "1"
            {{- range $name, $val := .Values.vllm.env }}
            - name: "{{ $name }}"
              value: "{{ $val }}"
            {{- end }}
          volumeMounts:
            - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
              mountPath: /workspace/.cache
              subPath: cache
            - name: shm
              mountPath: /dev/shm
      volumes:
        {{- /* Model cache: ephemeral volumeClaimTemplate when requested, otherwise the chart's PVC. */}}
        - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
          {{- if .Values.vllm.storage.useEphemeral }}
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.vllm.storage.size | quote }}
                storageClassName: {{ .Values.vllm.storage.class }}
          {{- else }}
          persistentVolumeClaim:
            claimName: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
          {{- end }}
        {{- /* Memory-backed emptyDir mounted at /dev/shm by the container above. */}}
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 10.24Gi
{{- end }}
---
{{- /*
  PersistentVolumeClaim named "<fullname>-vllm-inference-volume".
  Rendered whenever vLLM is enabled and .Values.vllm.storage.useEphemeral
  is not set.
*/}}
{{- if and .Values.vllm.enabled (not .Values.vllm.storage.useEphemeral) }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: {{ include "h2ogpt.namespace" . | quote }}
  name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
spec:
  {{- /*
    The class is rendered unquoted. Presumably so that an unset value
    becomes null instead of an empty string -- TODO confirm.
  */}}
  storageClassName: {{ .Values.vllm.storage.class }}
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: {{ .Values.vllm.storage.size | quote }}
{{- end }}
---
{{- /*
  Standalone lmdeploy inference Deployment.
  Rendered when .Values.lmdeploy.enabled is set and the single-pod h2oGPT
  stack (.Values.h2ogpt.stack.enabled) is not in use.
*/}}
{{- if and .Values.lmdeploy.enabled (not .Values.h2ogpt.stack.enabled) }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "h2ogpt.fullname" . }}-lmdeploy-inference
  namespace: {{ include "h2ogpt.namespace" . | quote }}
  labels:
    app: {{ include "h2ogpt.fullname" . }}-lmdeploy-inference
spec:
  {{- /* replicas is only set when autoscaling is disabled. */}}
  {{- if not .Values.lmdeploy.autoscaling.enabled }}
  replicas: {{ .Values.lmdeploy.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ include "h2ogpt.fullname" . }}-lmdeploy-inference
  {{- with .Values.lmdeploy.updateStrategy }}
  strategy:
    {{- toYaml . | nindent 4 }}
  {{- end }}
  template:
    metadata:
      {{- with .Values.lmdeploy.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        app: {{ include "h2ogpt.fullname" . }}-lmdeploy-inference
        {{- with .Values.lmdeploy.podLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    spec:
      {{- with .Values.lmdeploy.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.lmdeploy.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.lmdeploy.podSecurityContext | nindent 8 }}
      affinity:
        {{- /*
          NOTE(review): the selectors below match app=<fullname>, not this
          Deployment's own "-lmdeploy-inference" label. Confirm that is intended.
        */}}
        {{- if .Values.lmdeploy.podAffinity }}
        podAntiAffinity:
          {{- if .Values.lmdeploy.podAffinity.hostname }}
          requiredDuringSchedulingIgnoredDuringExecution:
            - topologyKey: kubernetes.io/hostname
              labelSelector:
                matchExpressions:
                  - key: app
                    operator: In
                    values:
                      - {{ include "h2ogpt.fullname" . }}
          {{- end }}
          {{- if .Values.lmdeploy.podAffinity.zone }}
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                topologyKey: failure-domain.beta.kubernetes.io/zone
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}
          {{- end }}
        {{- end }}
      {{- /* extraAffinity is merged verbatim under the affinity key above. */}}
      {{- with .Values.lmdeploy.extraAffinity }}
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.lmdeploy.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        - name: {{ include "h2ogpt.fullname" . }}-lmdeploy-inference
          image: "{{ .Values.lmdeploy.image.repository }}:{{ .Values.lmdeploy.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.lmdeploy.image.pullPolicy }}
          securityContext:
            {{- toYaml .Values.lmdeploy.securityContext | nindent 12 }}
          command:
            - "lmdeploy"
          {{- /* Fixed sub-command first, then anything listed in lmdeploy.containerArgs. */}}
          args:
            - "serve"
            - "api_server"
            {{- range .Values.lmdeploy.containerArgs }}
            - "{{ . }}"
            {{- end }}
          ports:
            - name: http
              protocol: TCP
              containerPort: 23333
          {{- /* Probe settings from values are merged next to the fixed httpGet target. */}}
          {{- with .Values.lmdeploy.livenessProbe }}
          livenessProbe:
            httpGet:
              port: http
              scheme: HTTP
              path: /
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .Values.lmdeploy.readinessProbe }}
          readinessProbe:
            httpGet:
              port: http
              scheme: HTTP
              path: /
            {{- toYaml . | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.lmdeploy.resources | nindent 12 }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-lmdeploy-inference-config
          env:
            - name: NCCL_IGNORE_DISABLED_P2P
              value: "1"
            {{- /* HF_HOME points at the cache directory mounted below. */}}
            - name: HF_HOME
              value: "/workspace/.cache"
            {{- range $name, $val := .Values.lmdeploy.env }}
            - name: "{{ $name }}"
              value: "{{ $val }}"
            {{- end }}
          volumeMounts:
            - name: {{ include "h2ogpt.fullname" . }}-lmdeploy-inference-volume
              mountPath: /workspace/.cache
              subPath: cache
            - name: shm
              mountPath: /dev/shm
      volumes:
        {{- /* Model cache: ephemeral volumeClaimTemplate when requested, otherwise the chart's PVC. */}}
        - name: {{ include "h2ogpt.fullname" . }}-lmdeploy-inference-volume
          {{- if .Values.lmdeploy.storage.useEphemeral }}
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.lmdeploy.storage.size | quote }}
                storageClassName: {{ .Values.lmdeploy.storage.class }}
          {{- else }}
          persistentVolumeClaim:
            claimName: {{ include "h2ogpt.fullname" . }}-lmdeploy-inference-volume
          {{- end }}
        {{- /* Memory-backed emptyDir mounted at /dev/shm by the container above. */}}
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 10.24Gi
{{- end }}
---
{{- /*
  PersistentVolumeClaim named "<fullname>-lmdeploy-inference-volume".
  Not rendered when lmdeploy is disabled or when
  .Values.lmdeploy.storage.useEphemeral is set.
*/}}
{{- if and .Values.lmdeploy.enabled (not .Values.lmdeploy.storage.useEphemeral) }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: {{ include "h2ogpt.namespace" . | quote }}
  name: {{ include "h2ogpt.fullname" . }}-lmdeploy-inference-volume
spec:
  {{- /*
    The class is rendered unquoted. Presumably so that an unset value
    becomes null instead of an empty string -- TODO confirm.
  */}}
  storageClassName: {{ .Values.lmdeploy.storage.class }}
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: {{ .Values.lmdeploy.storage.size | quote }}
{{- end }}