Joash committed on
Commit a307172 · 1 Parent(s): 5f0bb6b

Add comprehensive memory optimizations for model and Docker

Files changed (2)
  1. Dockerfile +16 -6
  2. src/model_manager.py +31 -7
Dockerfile CHANGED
@@ -12,8 +12,8 @@ RUN apt-get update && apt-get install -y \
     && rm -rf /var/lib/apt/lists/*
 
 # Create necessary directories with proper permissions
-RUN mkdir -p /app/logs /app/src/static /home/user/.cache/huggingface /home/user/.local \
-    && chmod -R 777 /app/logs /home/user/.cache/huggingface /home/user/.local
+RUN mkdir -p /app/logs /app/src/static /home/user/.cache/huggingface /home/user/.local /app/offload \
+    && chmod -R 777 /app/logs /home/user/.cache/huggingface /home/user/.local /app/offload
 
 # Create non-root user
 RUN useradd -m -u 1000 user \
@@ -29,8 +29,18 @@ ENV HF_HOME=/home/user/.cache/huggingface
 ENV TRANSFORMERS_CACHE=/home/user/.cache/huggingface
 # Set logging to stdout
 ENV LOG_FILE=/dev/stdout
-# Reduce memory usage
+# Memory optimizations
 ENV MALLOC_ARENA_MAX=2
+ENV MALLOC_TRIM_THRESHOLD_=100000
+ENV MALLOC_MMAP_THRESHOLD_=100000
+# Transformers optimizations
+ENV TRANSFORMERS_OFFLINE=1
+ENV TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX"
+ENV CUDA_LAUNCH_BLOCKING=1
+# Model optimizations
+ENV OMP_NUM_THREADS=1
+ENV MKL_NUM_THREADS=1
+ENV NUMEXPR_NUM_THREADS=1
 
 # Switch to non-root user
 USER user
@@ -42,7 +52,7 @@ RUN pip install --user --no-cache-dir "numpy<2.0.0"
 # Copy requirements first to leverage Docker cache
 COPY --chown=user:user requirements.txt .
 
-# Install Python dependencies with reduced memory usage
+# Install Python dependencies with memory optimizations
 RUN pip install --user --no-cache-dir -r requirements.txt
 
 # Copy application code
@@ -51,5 +61,5 @@ COPY --chown=user:user . .
 # Expose port for Hugging Face Spaces
 EXPOSE 7860
 
-# Run the application with reduced memory usage
-CMD ["python", "-u", "-m", "uvicorn", "src.api:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "debug", "--workers", "1"]
+# Run the application with memory optimizations
+CMD ["python", "-u", "-m", "uvicorn", "src.api:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "debug", "--workers", "1", "--limit-concurrency", "1", "--timeout-keep-alive", "120"]
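
Note on the new environment variables: the MALLOC_* settings are consumed by glibc and the *_NUM_THREADS settings by the numeric libraries, so they only help if the runtime actually picks them up. Below is a minimal sanity-check sketch that can be run inside the container; it assumes only that torch is installed and prints values rather than asserting anything.

import os
import torch

# Echo the allocator and threading variables set in the Dockerfile above.
# MALLOC_ARENA_MAX / MALLOC_TRIM_THRESHOLD_ are read by glibc, not Python,
# so they can only be inspected here, not verified directly.
for name in ("MALLOC_ARENA_MAX", "MALLOC_TRIM_THRESHOLD_", "MALLOC_MMAP_THRESHOLD_",
             "OMP_NUM_THREADS", "MKL_NUM_THREADS", "NUMEXPR_NUM_THREADS"):
    print(f"{name}={os.environ.get(name)}")

# torch sizes its intra-op thread pool from OMP_NUM_THREADS at import time,
# so with OMP_NUM_THREADS=1 this should report 1.
print(f"torch intra-op threads: {torch.get_num_threads()}")

One side effect of TRANSFORMERS_OFFLINE=1 is that model downloads are disabled, so the weights must already be present under HF_HOME (or baked into the image) for the load in src/model_manager.py to succeed.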
src/model_manager.py CHANGED
@@ -33,7 +33,8 @@ class ModelManager:
             logger.info(f"Loading tokenizer: {self.model_name}")
             self.tokenizer = AutoTokenizer.from_pretrained(
                 self.model_name,
-                token=Config.HUGGING_FACE_TOKEN
+                token=Config.HUGGING_FACE_TOKEN,
+                model_max_length=1024  # Limit max length to save memory
             )
             # Ensure we have the necessary special tokens
             special_tokens = {
@@ -62,14 +63,22 @@ class ModelManager:
                 bnb_4bit_quant_type="nf4"
             )
 
-            # Load model with 4-bit quantization
+            # Load model with memory optimizations
             self.model = AutoModelForCausalLM.from_pretrained(
                 self.model_name,
                 device_map={"": self.device},
                 quantization_config=quantization_config,
                 token=Config.HUGGING_FACE_TOKEN,
-                low_cpu_mem_usage=True
+                low_cpu_mem_usage=True,
+                torch_dtype=torch.float16,  # Use fp16 for additional memory savings
+                max_memory={0: "4GB"},  # Limit memory usage
+                offload_folder="offload",  # Enable CPU offloading
+                use_cache=False  # Disable KV cache to save memory
             )
+
+            # Enable gradient checkpointing
+            self.model.gradient_checkpointing_enable()
+
             # Resize embeddings to match tokenizer
             self.model.resize_token_embeddings(len(self.tokenizer))
             logger.info("Model loaded successfully")
@@ -78,18 +87,24 @@ class ModelManager:
             logger.error(f"Error loading model: {str(e)}")
             raise
 
-    def generate_text(self, prompt: str, max_new_tokens: int = 1024) -> str:
+    def generate_text(self, prompt: str, max_new_tokens: int = 512) -> str:
         """Generate text from prompt."""
         try:
             logger.info("Starting text generation")
             logger.debug(f"Prompt length: {len(prompt)}")
 
-            # Encode the prompt
-            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
+            # Encode the prompt with reduced max length
+            inputs = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                truncation=True,
+                max_length=512,  # Reduced max length
+                padding=True
+            )
             inputs = {k: v.to(self.device) for k, v in inputs.items()}
             logger.debug(f"Input tensor shape: {inputs['input_ids'].shape}")
 
-            # Generate response
+            # Generate response with memory optimizations
             logger.info("Generating response")
             with torch.no_grad():
                 outputs = self.model.generate(
@@ -100,8 +115,15 @@ class ModelManager:
                     top_p=Config.TOP_P,
                     pad_token_id=self.tokenizer.pad_token_id,
                     eos_token_id=self.tokenizer.eos_token_id,
+                    num_beams=1,  # Disable beam search to save memory
+                    use_cache=False,  # Disable KV cache
+                    early_stopping=True
                 )
 
+            # Clear CUDA cache after generation
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
             # Decode and return the generated text
             generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
             response = generated_text[len(prompt):].strip()
@@ -113,4 +135,6 @@ class ModelManager:
         except Exception as e:
             logger.error(f"Error generating text: {str(e)}")
             logger.error(f"Error details: {type(e).__name__}")
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
             raise
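
To check whether the 4-bit quantization, shorter context, and disabled KV cache actually lower the footprint, peak CUDA memory can be sampled around a generation call. This is a minimal sketch, assuming ModelManager is importable from src.model_manager, can be constructed without arguments, and exposes a loader method; the loader's name is not visible in this diff, so load_model() below is a placeholder.

import torch
from src.model_manager import ModelManager

manager = ModelManager()
manager.load_model()  # placeholder name; call whatever method wraps the loading code shown above

if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()  # start peak tracking from zero

reply = manager.generate_text("Briefly explain 4-bit quantization.", max_new_tokens=64)
print(reply)

if torch.cuda.is_available():
    peak_gib = torch.cuda.max_memory_allocated() / (1024 ** 3)
    print(f"Peak CUDA memory during generation: {peak_gib:.2f} GiB")

Comparing this number before and after the commit isolates the model's contribution from uvicorn and the rest of the process.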