Upload 4 files

- README.md +33 -38
- docker-compose.yaml +13 -13
- hpa.yaml +18 -0
- k8s.yaml +12 -18
README.md
CHANGED
@@ -1,58 +1,53 @@
----
-title: AI Inference Architecture for Healthcare
-emoji: 🧠
-colorFrom: blue
-colorTo: green
-sdk: static
-app_file: index.html
-pinned: false
-tags:
-- healthcare
-- docker
-- fastapi
-- kubernetes
-- triton-inference-server
-- llm-inference
-- production-ready
----
-
-# AI Inference Architecture for Healthcare
-
-This project provides a scalable, production-ready AI inference architecture designed for healthcare and pharmaceutical applications. It integrates Triton Inference Server, FastAPI,
-
-## Key Features
-
-- Routing layer using FastAPI or NGINX
-- LLM model support via TorchScript / ONNX
-- Optional user auth, billing hooks, and monitoring
-- Designed for HIPAA-compliant environments
-
-- **Production (Kubernetes)**: via `k8s.yaml`
-
-##
-```bash
-docker compose up --build
-```
-
-```bash
-kubectl apply -f k8s.yaml
-```
-
-##
-
-##
-
-Apache 2.0
+# AI Inference Architecture for Healthcare
+
+This project provides a scalable, production-ready AI inference architecture designed for healthcare and pharmaceutical applications. It integrates Triton Inference Server, FastAPI, and Kubernetes to support high-throughput model inference.
+
+## 🚀 Key Features
+
+- Modular container-based architecture with a FastAPI gateway
+- Supports NLP and CV models with optional preprocessing
+- Inference via Triton Inference Server using ONNX or TorchScript models
+- GitHub Actions-powered CI/CD pipeline to auto-deploy model updates
+- Kubernetes-based pod management, autoscaling, and volume mounting
+- Full observability stack: Prometheus + Grafana for metrics and monitoring
+- Aligned with HIPAA requirements: secure APIs, logging, encryption
+
+## 🧱 Architecture Overview
+
+```
+Healthcare/Pharma Clients → FastAPI Gateway → Optional Preprocessor → Triton Pod
+          ↓                       ↓                    ↓                  ↓
+Model Registry ← GitHub CI/CD Pipeline ← Kubernetes ← Monitoring (Prometheus + Grafana)
+```
+
+## ⚙️ Deployment Options
+
+### ▶️ Local (Docker Compose)
+```bash
+docker compose up --build
+```
+
+### ☸️ Kubernetes (Production)
+```bash
+kubectl apply -f k8s.yaml
+kubectl apply -f hpa.yaml
+```
+
+## 📦 Model Lifecycle
+
+1. Train the model locally or in a pipeline (e.g., PyTorch/ONNX)
+2. Push the model to the GitHub repository
+3. GitHub Actions CI/CD triggers and pushes the model to the Model Registry
+4. Kubernetes mounts the model volume into the Triton pod
+5. Triton picks up the new model (with `--model-control-mode=poll`, or via an explicit load request)
+
+## 🔍 Monitoring and Observability
+
+- Prometheus scrapes Triton's metrics endpoint on port 8002 (see the pod annotations in `k8s.yaml`)
+- Grafana dashboards track latency, throughput, and failures
+
+## 🧪 Sample Inference Request
+
+```bash
+curl -X POST http://localhost:8000/infer \
+  -H "Content-Type: application/json" \
+  -d '{"input": "Patient data or image here"}'
+```
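The sample request implies an `/infer` route on the FastAPI gateway that forwards to Triton. The gateway code itself is not part of this upload, so the following is only a minimal sketch of what `app/main.py` could look like; the model name `model`, the tensor names `INPUT`/`OUTPUT`, and the BYTES in/out shapes are all assumptions, and `tritonclient[http]` plus `numpy` would need to be added to the gateway image.

```python
# Hypothetical app/main.py — a sketch, not the uploaded gateway code.
# Assumes a Triton model named "model" with one BYTES input "INPUT" and
# one BYTES output "OUTPUT"; adjust names and shapes to the real model config.
import numpy as np
import tritonclient.http as httpclient
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()
# Inside the Compose network, Triton is reachable by its service name on the
# container port, regardless of how ports are mapped to the host.
triton = httpclient.InferenceServerClient(url="triton:8000")

class InferRequest(BaseModel):
    input: str

@app.post("/infer")
def infer(req: InferRequest):
    # Pack the request payload as a 1-element BYTES tensor.
    tensor = httpclient.InferInput("INPUT", [1], "BYTES")
    tensor.set_data_from_numpy(np.array([req.input.encode()], dtype=np.object_))
    result = triton.infer(model_name="model", inputs=[tensor])
    return {"output": result.as_numpy("OUTPUT")[0].decode()}
```

With something like this in `./app`, the curl command above should round-trip through the gateway to Triton.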
docker-compose.yaml
CHANGED
@@ -1,19 +1,19 @@
-version:
-services:
-    image:
-    ports:
-      - "8000:8000"
-      - "8001:8001"
-    volumes:
-      - ./models:/models
-    command: [
-      "tritonserver",
-      "--model-repository=/models"
-    ]
-  api:
-    image: tiangolo/uvicorn-gunicorn-fastapi:python3.9
-    volumes:
-      - ./app:/app
-    ports:
-      - "8080:80"
+version: '3.8'
+
+services:
+  fastapi:
+    image: tiangolo/uvicorn-gunicorn-fastapi:python3.9
+    ports:
+      - "8000:80"
+    volumes:
+      - ./app:/app
+
+  triton:
+    image: nvcr.io/nvidia/tritonserver:22.10-py3
+    command: ["tritonserver", "--model-repository=/models"]
+    ports:
+      # Host port 8500 avoids clashing with the fastapi service, which already
+      # publishes host port 8000; in-network clients still use triton:8000.
+      - "8500:8000"
+      - "8001:8001"
+      - "8002:8002"
+    volumes:
+      - ./models:/models
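A quick way to confirm the Compose stack is up, sketched against the port mappings above (gateway on 8000, Triton HTTP remapped to 8500, metrics on 8002). The `/v2/health/ready` route and the `nv_inference_count` metric family are standard Triton; the host ports are this file's assumptions.

```python
# smoke_test.py — stdlib-only check of the Compose stack's endpoints.
import urllib.request

def get(url: str) -> tuple[int, str]:
    with urllib.request.urlopen(url, timeout=5) as resp:
        return resp.status, resp.read().decode()

# Triton's KServe-style readiness endpoint returns 200 once models are loaded.
status, _ = get("http://localhost:8500/v2/health/ready")
print("triton ready:", status == 200)

# The metrics endpoint exposes Triton's Prometheus families, e.g.
# nv_inference_count and nv_inference_request_duration_us.
_, metrics = get("http://localhost:8002/metrics")
counts = [line for line in metrics.splitlines()
          if line.startswith("nv_inference_count")]
print("\n".join(counts) or "no inference counts reported yet")
```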
hpa.yaml
ADDED
@@ -0,0 +1,18 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: triton-hpa
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: triton-deployment
+  minReplicas: 1
+  maxReplicas: 5
+  metrics:
+  - type: Resource
+    resource:
+      name: cpu
+      target:
+        type: Utilization
+        averageUtilization: 70
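Two prerequisites this manifest does not spell out: metrics-server must be running in the cluster, and the target Deployment's containers need CPU requests (added to `k8s.yaml` below as placeholders), or the utilization target resolves to "unknown" and no scaling happens. To watch the HPA react, one option is a small load generator; the sketch below assumes the gateway has been port-forwarded to localhost:8000, and both the endpoint and payload are hypothetical.

```python
# load_gen.py — hypothetical load generator to push Triton pod CPU past the
# 70% utilization target; URL and payload are assumptions, not repo contents.
import concurrent.futures
import json
import urllib.request

URL = "http://localhost:8000/infer"  # e.g. reachable via `kubectl port-forward`

def one_request(_: int) -> int:
    body = json.dumps({"input": "synthetic load"}).encode()
    req = urllib.request.Request(
        URL, data=body, headers={"Content-Type": "application/json"})
    with urllib.request.urlopen(req, timeout=30) as resp:
        return resp.status

# Fire 2000 concurrent-ish requests to generate sustained CPU load.
with concurrent.futures.ThreadPoolExecutor(max_workers=32) as pool:
    statuses = list(pool.map(one_request, range(2000)))
print(f"{statuses.count(200)}/{len(statuses)} requests succeeded")
```

`kubectl get hpa triton-hpa --watch` should then show replicas climbing toward 5.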
k8s.yaml
CHANGED
@@ -1,7 +1,7 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: triton-
+  name: triton-deployment
 spec:
   replicas: 1
   selector:
@@ -11,28 +11,22 @@ spec:
     metadata:
       labels:
         app: triton
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "8002"
     spec:
       containers:
       - name: triton
-        image: nvcr.io/nvidia/tritonserver:
+        image: nvcr.io/nvidia/tritonserver:22.10-py3
+        args: ["tritonserver", "--model-repository=/models"]
         ports:
         - containerPort: 8000
+        - containerPort: 8001
+        - containerPort: 8002
+        # CPU requests are required for the CPU-utilization HPA in hpa.yaml
+        # to compute a percentage; these values are illustrative placeholders.
+        resources:
+          requests:
+            cpu: "1"
+            memory: 2Gi
         volumeMounts:
+        - name: model-volume
+          mountPath: /models
       volumes:
       - name: model-volume
+        persistentVolumeClaim:
+          claimName: model-pvc
-apiVersion: v1
-kind: Service
-metadata:
-  name: triton-service
-spec:
-  selector:
-    app: triton
-  ports:
-  - protocol: TCP
-    port: 80
-    targetPort: 8000
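One note on the README's model lifecycle: Triton re-scans the mounted volume on its own only when started with `--model-control-mode=poll`. The `args` above set no control mode, so under `--model-control-mode=explicit` a CI/CD step would have to request the (re)load itself. A sketch of such a step using `tritonclient`, where the model name `model` and the port-forward are placeholders (e.g. `kubectl port-forward deployment/triton-deployment 8500:8000`):

```python
# reload_model.py — hypothetical CI/CD step for lifecycle steps 3-5: after a
# new version lands on the model-pvc volume, ask Triton to (re)load it.
# Requires Triton to run with --model-control-mode=explicit.
import sys

import tritonclient.http as httpclient

MODEL = "model"  # placeholder; substitute the real model directory name

# Assumes Triton's HTTP port has been port-forwarded to localhost:8500.
client = httpclient.InferenceServerClient(url="localhost:8500")
client.load_model(MODEL)  # (re)loads the model from the repository volume
if not client.is_model_ready(MODEL):
    sys.exit(f"{MODEL} failed to load")
print(client.get_model_metadata(MODEL))  # name, versions, inputs, outputs
```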