Joash committed
Commit 5f0bb6b · 1 Parent(s): 1f37a6a

Optimize memory usage with 4-bit quantization and Docker settings

Files changed (2)
  1. Dockerfile +7 -4
  2. src/model_manager.py +11 -3
Dockerfile CHANGED
@@ -4,10 +4,11 @@ FROM python:3.11-slim
 # Set working directory
 WORKDIR /app
 
-# Install system dependencies
+# Install system dependencies including cuda-toolkit for bitsandbytes
 RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
+    git \
     && rm -rf /var/lib/apt/lists/*
 
 # Create necessary directories with proper permissions
@@ -28,6 +29,8 @@ ENV HF_HOME=/home/user/.cache/huggingface
 ENV TRANSFORMERS_CACHE=/home/user/.cache/huggingface
 # Set logging to stdout
 ENV LOG_FILE=/dev/stdout
+# Reduce memory usage
+ENV MALLOC_ARENA_MAX=2
 
 # Switch to non-root user
 USER user
@@ -39,7 +42,7 @@ RUN pip install --user --no-cache-dir "numpy<2.0.0"
 # Copy requirements first to leverage Docker cache
 COPY --chown=user:user requirements.txt .
 
-# Install Python dependencies
+# Install Python dependencies with reduced memory usage
 RUN pip install --user --no-cache-dir -r requirements.txt
 
 # Copy application code
@@ -48,5 +51,5 @@ COPY --chown=user:user . .
 # Expose port for Hugging Face Spaces
 EXPOSE 7860
 
-# Run the application with logging
-CMD ["python", "-u", "-m", "uvicorn", "src.api:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "debug"]
+# Run the application with reduced memory usage
+CMD ["python", "-u", "-m", "uvicorn", "src.api:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "debug", "--workers", "1"]
src/model_manager.py CHANGED
@@ -1,5 +1,5 @@
 import logging
-from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import torch
 from huggingface_hub import login
 from .config import Config
@@ -54,11 +54,19 @@ class ModelManager:
         logger.info(f"Loading model: {self.model_name}")
         logger.info(f"Using device: {self.device}")
 
-        # Load model with CPU configuration
+        # Configure 4-bit quantization
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4"
+        )
+
+        # Load model with 4-bit quantization
         self.model = AutoModelForCausalLM.from_pretrained(
             self.model_name,
             device_map={"": self.device},
-            torch_dtype=torch.float32,  # Use float32 for CPU
+            quantization_config=quantization_config,
             token=Config.HUGGING_FACE_TOKEN,
             low_cpu_mem_usage=True
         )
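
The arithmetic behind the commit message: a 7B-parameter model needs about 28 GB of RAM in float32, while NF4 stores weights in roughly 4 bits each, bringing the same model down to about 3.5-4 GB plus activation overhead. Below is a minimal standalone sketch of the same loading path, assuming a CUDA-capable host (bitsandbytes 4-bit quantization generally requires a GPU backend) and a placeholder model id rather than whatever this Space actually loads:

# quantize_sketch.py - standalone sketch of the loading logic in this commit.
# Assumptions: a CUDA GPU is available, and the model id is a placeholder.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "mistralai/Mistral-7B-Instruct-v0.2"  # placeholder model id

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # store weights as 4-bit values
    bnb_4bit_compute_dtype=torch.float16,  # run matmuls in fp16
    bnb_4bit_use_double_quant=True,        # quantize the quantization constants too
    bnb_4bit_quant_type="nf4",             # NormalFloat4, suited to normally distributed weights
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",                     # place layers on the available GPU(s)
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,                # stream weights instead of building a full fp32 copy first
)

prompt = "Briefly explain 4-bit quantization."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))

One caveat worth verifying against the Space's hardware: if self.device resolves to "cpu", the quantized load in this commit is likely to fail, since the bitsandbytes 4-bit kernels are CUDA-only.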