Joash committed on
Commit a307172 · 1 Parent(s): 5f0bb6b

Add comprehensive memory optimizations for model and Docker

Files changed (2)
  1. Dockerfile +16 -6
  2. src/model_manager.py +31 -7
Dockerfile CHANGED
@@ -12,8 +12,8 @@ RUN apt-get update && apt-get install -y \
     && rm -rf /var/lib/apt/lists/*
 
 # Create necessary directories with proper permissions
-RUN mkdir -p /app/logs /app/src/static /home/user/.cache/huggingface /home/user/.local \
-    && chmod -R 777 /app/logs /home/user/.cache/huggingface /home/user/.local
+RUN mkdir -p /app/logs /app/src/static /home/user/.cache/huggingface /home/user/.local /app/offload \
+    && chmod -R 777 /app/logs /home/user/.cache/huggingface /home/user/.local /app/offload
 
 # Create non-root user
 RUN useradd -m -u 1000 user \
@@ -29,8 +29,18 @@ ENV HF_HOME=/home/user/.cache/huggingface
 ENV TRANSFORMERS_CACHE=/home/user/.cache/huggingface
 # Set logging to stdout
 ENV LOG_FILE=/dev/stdout
-# Reduce memory usage
+# Memory optimizations
 ENV MALLOC_ARENA_MAX=2
+ENV MALLOC_TRIM_THRESHOLD_=100000
+ENV MALLOC_MMAP_THRESHOLD_=100000
+# Transformers optimizations
+ENV TRANSFORMERS_OFFLINE=1
+ENV TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX"
+ENV CUDA_LAUNCH_BLOCKING=1
+# Model optimizations
+ENV OMP_NUM_THREADS=1
+ENV MKL_NUM_THREADS=1
+ENV NUMEXPR_NUM_THREADS=1
 
 # Switch to non-root user
 USER user
@@ -42,7 +52,7 @@ RUN pip install --user --no-cache-dir "numpy<2.0.0"
 # Copy requirements first to leverage Docker cache
 COPY --chown=user:user requirements.txt .
 
-# Install Python dependencies with reduced memory usage
+# Install Python dependencies with memory optimizations
 RUN pip install --user --no-cache-dir -r requirements.txt
 
 # Copy application code
@@ -51,5 +61,5 @@ COPY --chown=user:user . .
 # Expose port for Hugging Face Spaces
 EXPOSE 7860
 
-# Run the application with reduced memory usage
-CMD ["python", "-u", "-m", "uvicorn", "src.api:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "debug", "--workers", "1"]
+# Run the application with memory optimizations
+CMD ["python", "-u", "-m", "uvicorn", "src.api:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "debug", "--workers", "1", "--limit-concurrency", "1", "--timeout-keep-alive", "120"]
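
Note on the new environment variables: the MALLOC_* settings are consumed by glibc and the *_NUM_THREADS settings by the numeric libraries, so they only help if the runtime actually picks them up. Below is a minimal sanity-check sketch that can be run inside the container; it assumes only that torch is installed and prints values rather than asserting anything.

import os
import torch

# Echo the allocator and threading variables set in the Dockerfile above.
# MALLOC_ARENA_MAX / MALLOC_TRIM_THRESHOLD_ are read by glibc, not Python,
# so they can only be inspected here, not verified directly.
for name in ("MALLOC_ARENA_MAX", "MALLOC_TRIM_THRESHOLD_", "MALLOC_MMAP_THRESHOLD_",
             "OMP_NUM_THREADS", "MKL_NUM_THREADS", "NUMEXPR_NUM_THREADS"):
    print(f"{name}={os.environ.get(name)}")

# torch sizes its intra-op thread pool from OMP_NUM_THREADS at import time,
# so with OMP_NUM_THREADS=1 this should report 1.
print(f"torch intra-op threads: {torch.get_num_threads()}")

One side effect of TRANSFORMERS_OFFLINE=1 is that model downloads are disabled, so the weights must already be present under HF_HOME (or baked into the image) for the load in src/model_manager.py to succeed.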
src/model_manager.py CHANGED
@@ -33,7 +33,8 @@ class ModelManager:
             logger.info(f"Loading tokenizer: {self.model_name}")
             self.tokenizer = AutoTokenizer.from_pretrained(
                 self.model_name,
-                token=Config.HUGGING_FACE_TOKEN
+                token=Config.HUGGING_FACE_TOKEN,
+                model_max_length=1024  # Limit max length to save memory
             )
             # Ensure we have the necessary special tokens
             special_tokens = {
@@ -62,14 +63,22 @@ class ModelManager:
                 bnb_4bit_quant_type="nf4"
             )
 
-            # Load model with 4-bit quantization
+            # Load model with memory optimizations
             self.model = AutoModelForCausalLM.from_pretrained(
                 self.model_name,
                 device_map={"": self.device},
                 quantization_config=quantization_config,
                 token=Config.HUGGING_FACE_TOKEN,
-                low_cpu_mem_usage=True
+                low_cpu_mem_usage=True,
+                torch_dtype=torch.float16,  # Use fp16 for additional memory savings
+                max_memory={0: "4GB"},  # Limit memory usage
+                offload_folder="offload",  # Enable CPU offloading
+                use_cache=False  # Disable KV cache to save memory
             )
+
+            # Enable gradient checkpointing
+            self.model.gradient_checkpointing_enable()
+
             # Resize embeddings to match tokenizer
             self.model.resize_token_embeddings(len(self.tokenizer))
             logger.info("Model loaded successfully")
@@ -78,18 +87,24 @@ class ModelManager:
             logger.error(f"Error loading model: {str(e)}")
             raise
 
-    def generate_text(self, prompt: str, max_new_tokens: int = 1024) -> str:
+    def generate_text(self, prompt: str, max_new_tokens: int = 512) -> str:
         """Generate text from prompt."""
         try:
             logger.info("Starting text generation")
             logger.debug(f"Prompt length: {len(prompt)}")
 
-            # Encode the prompt
-            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
+            # Encode the prompt with reduced max length
+            inputs = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                truncation=True,
+                max_length=512,  # Reduced max length
+                padding=True
+            )
             inputs = {k: v.to(self.device) for k, v in inputs.items()}
             logger.debug(f"Input tensor shape: {inputs['input_ids'].shape}")
 
-            # Generate response
+            # Generate response with memory optimizations
             logger.info("Generating response")
             with torch.no_grad():
                 outputs = self.model.generate(
@@ -100,8 +115,15 @@ class ModelManager:
                     top_p=Config.TOP_P,
                     pad_token_id=self.tokenizer.pad_token_id,
                     eos_token_id=self.tokenizer.eos_token_id,
+                    num_beams=1,  # Disable beam search to save memory
+                    use_cache=False,  # Disable KV cache
+                    early_stopping=True
                 )
 
+            # Clear CUDA cache after generation
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
             # Decode and return the generated text
             generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
             response = generated_text[len(prompt):].strip()
@@ -113,4 +135,6 @@ class ModelManager:
         except Exception as e:
             logger.error(f"Error generating text: {str(e)}")
             logger.error(f"Error details: {type(e).__name__}")
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
             raise
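
To check whether the 4-bit quantization, shorter context, and disabled KV cache actually lower the footprint, peak CUDA memory can be sampled around a generation call. This is a minimal sketch, assuming ModelManager is importable from src.model_manager, can be constructed without arguments, and exposes a loader method; the loader's name is not visible in this diff, so load_model() below is a placeholder.

import torch
from src.model_manager import ModelManager

manager = ModelManager()
manager.load_model()  # placeholder name; call whatever method wraps the loading code shown above

if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()  # start peak tracking from zero

reply = manager.generate_text("Briefly explain 4-bit quantization.", max_new_tokens=64)
print(reply)

if torch.cuda.is_available():
    peak_gib = torch.cuda.max_memory_allocated() / (1024 ** 3)
    print(f"Peak CUDA memory during generation: {peak_gib:.2f} GiB")

Comparing this number before and after the commit isolates the model's contribution from uvicorn and the rest of the process.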