Joash committed · Commit a307172 · Parent(s): 5f0bb6b

Add comprehensive memory optimizations for model and Docker

Files changed:
- Dockerfile (+16 -6)
- src/model_manager.py (+31 -7)
Dockerfile CHANGED

@@ -12,8 +12,8 @@ RUN apt-get update && apt-get install -y \
     && rm -rf /var/lib/apt/lists/*
 
 # Create necessary directories with proper permissions
-RUN mkdir -p /app/logs /app/src/static /home/user/.cache/huggingface /home/user/.local \
-    && chmod -R 777 /app/logs /home/user/.cache/huggingface /home/user/.local
+RUN mkdir -p /app/logs /app/src/static /home/user/.cache/huggingface /home/user/.local /app/offload \
+    && chmod -R 777 /app/logs /home/user/.cache/huggingface /home/user/.local /app/offload
 
 # Create non-root user
 RUN useradd -m -u 1000 user \
@@ -29,8 +29,18 @@ ENV HF_HOME=/home/user/.cache/huggingface
 ENV TRANSFORMERS_CACHE=/home/user/.cache/huggingface
 # Set logging to stdout
 ENV LOG_FILE=/dev/stdout
-#
+# Memory optimizations
 ENV MALLOC_ARENA_MAX=2
+ENV MALLOC_TRIM_THRESHOLD_=100000
+ENV MALLOC_MMAP_THRESHOLD_=100000
+# Transformers optimizations
+ENV TRANSFORMERS_OFFLINE=1
+ENV TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX"
+ENV CUDA_LAUNCH_BLOCKING=1
+# Model optimizations
+ENV OMP_NUM_THREADS=1
+ENV MKL_NUM_THREADS=1
+ENV NUMEXPR_NUM_THREADS=1
 
 # Switch to non-root user
 USER user
@@ -42,7 +52,7 @@ RUN pip install --user --no-cache-dir "numpy<2.0.0"
 # Copy requirements first to leverage Docker cache
 COPY --chown=user:user requirements.txt .
 
-# Install Python dependencies with
+# Install Python dependencies with memory optimizations
 RUN pip install --user --no-cache-dir -r requirements.txt
 
 # Copy application code
@@ -51,5 +61,5 @@ COPY --chown=user:user . .
 # Expose port for Hugging Face Spaces
 EXPOSE 7860
 
-# Run the application with
-CMD ["python", "-u", "-m", "uvicorn", "src.api:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "debug", "--workers", "1"]
+# Run the application with memory optimizations
+CMD ["python", "-u", "-m", "uvicorn", "src.api:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "debug", "--workers", "1", "--limit-concurrency", "1", "--timeout-keep-alive", "120"]
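Note on the thread-count variables added above: OMP_NUM_THREADS, MKL_NUM_THREADS and NUMEXPR_NUM_THREADS are only honoured if they are set before the numerical libraries initialise their thread pools, which the ENV instructions guarantee inside the container. Below is a minimal sketch of how the application side could pin the same limits when run outside Docker; the fallback logic is an illustration and not part of this commit.

    import os

    # Mirror the Dockerfile defaults so a local run stays single-threaded too.
    for var in ("OMP_NUM_THREADS", "MKL_NUM_THREADS", "NUMEXPR_NUM_THREADS"):
        os.environ.setdefault(var, "1")

    import torch  # imported only after the env vars are pinned

    # Keep PyTorch's intra-op thread pool in line with the env settings.
    torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"]))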
src/model_manager.py CHANGED

@@ -33,7 +33,8 @@ class ModelManager:
             logger.info(f"Loading tokenizer: {self.model_name}")
             self.tokenizer = AutoTokenizer.from_pretrained(
                 self.model_name,
-                token=Config.HUGGING_FACE_TOKEN
+                token=Config.HUGGING_FACE_TOKEN,
+                model_max_length=1024  # Limit max length to save memory
             )
             # Ensure we have the necessary special tokens
             special_tokens = {
@@ -62,14 +63,22 @@ class ModelManager:
                 bnb_4bit_quant_type="nf4"
             )
 
-            # Load model with
+            # Load model with memory optimizations
             self.model = AutoModelForCausalLM.from_pretrained(
                 self.model_name,
                 device_map={"": self.device},
                 quantization_config=quantization_config,
                 token=Config.HUGGING_FACE_TOKEN,
-                low_cpu_mem_usage=True
+                low_cpu_mem_usage=True,
+                torch_dtype=torch.float16,  # Use fp16 for additional memory savings
+                max_memory={0: "4GB"},  # Limit memory usage
+                offload_folder="offload",  # Enable CPU offloading
+                use_cache=False  # Disable KV cache to save memory
             )
+
+            # Enable gradient checkpointing
+            self.model.gradient_checkpointing_enable()
+
             # Resize embeddings to match tokenizer
             self.model.resize_token_embeddings(len(self.tokenizer))
             logger.info("Model loaded successfully")
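The quantization_config passed to from_pretrained in the hunk above is only partially visible in this diff (its last field, bnb_4bit_quant_type="nf4", appears as context). For orientation, here is a minimal 4-bit BitsAndBytesConfig consistent with that line; every field other than the quant type is an assumption, not taken from this commit.

    import torch
    from transformers import BitsAndBytesConfig

    # Illustrative 4-bit config; only bnb_4bit_quant_type="nf4" is confirmed by the diff.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )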
@@ -78,18 +87,24 @@ class ModelManager:
             logger.error(f"Error loading model: {str(e)}")
             raise
 
-    def generate_text(self, prompt: str, max_new_tokens: int =
+    def generate_text(self, prompt: str, max_new_tokens: int = 512) -> str:
         """Generate text from prompt."""
         try:
             logger.info("Starting text generation")
             logger.debug(f"Prompt length: {len(prompt)}")
 
-            # Encode the prompt
-            inputs = self.tokenizer(
+            # Encode the prompt with reduced max length
+            inputs = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                truncation=True,
+                max_length=512,  # Reduced max length
+                padding=True
+            )
             inputs = {k: v.to(self.device) for k, v in inputs.items()}
             logger.debug(f"Input tensor shape: {inputs['input_ids'].shape}")
 
-            # Generate response
+            # Generate response with memory optimizations
             logger.info("Generating response")
             with torch.no_grad():
                 outputs = self.model.generate(
@@ -100,8 +115,15 @@ class ModelManager:
                     top_p=Config.TOP_P,
                     pad_token_id=self.tokenizer.pad_token_id,
                     eos_token_id=self.tokenizer.eos_token_id,
+                    num_beams=1,  # Disable beam search to save memory
+                    use_cache=False,  # Disable KV cache
+                    early_stopping=True
                 )
 
+            # Clear CUDA cache after generation
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
             # Decode and return the generated text
             generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
             response = generated_text[len(prompt):].strip()
@@ -113,4 +135,6 @@ class ModelManager:
         except Exception as e:
             logger.error(f"Error generating text: {str(e)}")
             logger.error(f"Error details: {type(e).__name__}")
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
             raise
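The cache-clearing pattern added in the hunks above appears twice: once after a successful generation and once in the exception handler. A self-contained sketch of the equivalent try/finally form, independent of this repository's classes (run_generation and generate_fn are illustrative names, not part of this commit):

    import torch

    def run_generation(generate_fn):
        """Run a generation callable and always release cached CUDA memory afterwards."""
        try:
            with torch.no_grad():  # inference only, no gradients needed
                return generate_fn()
        finally:
            # Same effect as the explicit empty_cache() calls in the diff above:
            # hand cached allocator blocks back whether generation succeeded or failed.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()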