Joash committed
Commit 5f0bb6b · 1 Parent(s): 1f37a6a

Optimize memory usage with 4-bit quantization and Docker settings

Files changed (2)
  1. Dockerfile +7 -4
  2. src/model_manager.py +11 -3
Dockerfile CHANGED
@@ -4,10 +4,11 @@ FROM python:3.11-slim
 # Set working directory
 WORKDIR /app
 
-# Install system dependencies
+# Install system dependencies including cuda-toolkit for bitsandbytes
 RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
+    git \
     && rm -rf /var/lib/apt/lists/*
 
 # Create necessary directories with proper permissions
@@ -28,6 +29,8 @@ ENV HF_HOME=/home/user/.cache/huggingface
 ENV TRANSFORMERS_CACHE=/home/user/.cache/huggingface
 # Set logging to stdout
 ENV LOG_FILE=/dev/stdout
+# Reduce memory usage
+ENV MALLOC_ARENA_MAX=2
 
 # Switch to non-root user
 USER user
@@ -39,7 +42,7 @@ RUN pip install --user --no-cache-dir "numpy<2.0.0"
 # Copy requirements first to leverage Docker cache
 COPY --chown=user:user requirements.txt .
 
-# Install Python dependencies
+# Install Python dependencies with reduced memory usage
 RUN pip install --user --no-cache-dir -r requirements.txt
 
 # Copy application code
@@ -48,5 +51,5 @@ COPY --chown=user:user . .
 # Expose port for Hugging Face Spaces
 EXPOSE 7860
 
-# Run the application with logging
-CMD ["python", "-u", "-m", "uvicorn", "src.api:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "debug"]
+# Run the application with reduced memory usage
+CMD ["python", "-u", "-m", "uvicorn", "src.api:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "debug", "--workers", "1"]
src/model_manager.py CHANGED
@@ -1,5 +1,5 @@
 import logging
-from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import torch
 from huggingface_hub import login
 from .config import Config
@@ -54,11 +54,19 @@ class ModelManager:
         logger.info(f"Loading model: {self.model_name}")
         logger.info(f"Using device: {self.device}")
 
-        # Load model with CPU configuration
+        # Configure 4-bit quantization
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4"
+        )
+
+        # Load model with 4-bit quantization
         self.model = AutoModelForCausalLM.from_pretrained(
             self.model_name,
             device_map={"": self.device},
-            torch_dtype=torch.float32,  # Use float32 for CPU
+            quantization_config=quantization_config,
             token=Config.HUGGING_FACE_TOKEN,
             low_cpu_mem_usage=True
         )
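
The arithmetic behind the commit message: a 7B-parameter model needs about 28 GB of RAM in float32, while NF4 stores weights in roughly 4 bits each, bringing the same model down to about 3.5-4 GB plus activation overhead. Below is a minimal standalone sketch of the same loading path, assuming a CUDA-capable host (bitsandbytes 4-bit quantization generally requires a GPU backend) and a placeholder model id rather than whatever this Space actually loads:

# quantize_sketch.py - standalone sketch of the loading logic in this commit.
# Assumptions: a CUDA GPU is available, and the model id is a placeholder.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "mistralai/Mistral-7B-Instruct-v0.2"  # placeholder model id

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # store weights as 4-bit values
    bnb_4bit_compute_dtype=torch.float16,  # run matmuls in fp16
    bnb_4bit_use_double_quant=True,        # quantize the quantization constants too
    bnb_4bit_quant_type="nf4",             # NormalFloat4, suited to normally distributed weights
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",                     # place layers on the available GPU(s)
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,                # stream weights instead of building a full fp32 copy first
)

prompt = "Briefly explain 4-bit quantization."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))

One caveat worth verifying against the Space's hardware: if self.device resolves to "cpu", the quantized load in this commit is likely to fail, since the bitsandbytes 4-bit kernels are CUDA-only.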