Joash committed · 69455b9 · Parent: b4ae3b7

Fix offline mode and improve model loading

Files changed:
- Dockerfile (+3 -7)
- src/model_manager.py (+9 -9)
Dockerfile
CHANGED
@@ -26,21 +26,17 @@ ENV PYTHONDONTWRITEBYTECODE=1
 ENV PORT=7860
 ENV PATH="/home/user/.local/bin:${PATH}"
 ENV HF_HOME=/home/user/.cache/huggingface
-ENV TRANSFORMERS_CACHE=/home/user/.cache/huggingface
-# Set logging to stdout
-ENV LOG_FILE=/dev/stdout
 # Memory optimizations
 ENV MALLOC_ARENA_MAX=2
 ENV MALLOC_TRIM_THRESHOLD_=100000
 ENV MALLOC_MMAP_THRESHOLD_=100000
-# Transformers optimizations
-ENV TRANSFORMERS_OFFLINE=1
-ENV TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX"
-ENV CUDA_LAUNCH_BLOCKING=1
 # Model optimizations
 ENV OMP_NUM_THREADS=1
 ENV MKL_NUM_THREADS=1
 ENV NUMEXPR_NUM_THREADS=1
+# Ensure offline mode is disabled
+ENV HF_HUB_OFFLINE=0
+ENV TRANSFORMERS_OFFLINE=0
 
 # Switch to non-root user
 USER user
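The two new ENV lines flip the image from offline to online mode. Both flags are also honored at runtime, and are typically read when huggingface_hub/transformers are imported, so they must be set before the import. A minimal sketch of the same behavior in Python, using gpt2 as a hypothetical repo (the Space's actual model name is not shown in this diff):

import os

# Offline flags are typically read at import time, so set them before
# importing huggingface_hub or transformers.
os.environ["HF_HUB_OFFLINE"] = "0"        # allow Hub network access
os.environ["TRANSFORMERS_OFFLINE"] = "0"  # allow transformers downloads

from huggingface_hub import hf_hub_download

# With offline mode off, this fetches (or reuses from the local cache)
# a small file from the Hub.
config_path = hf_hub_download(repo_id="gpt2", filename="config.json")
print(f"Downloaded to: {config_path}")

With HF_HUB_OFFLINE=1 (the old setting), the same call would fail unless the file was already cached, which is exactly the failure mode this commit fixes.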
src/model_manager.py
CHANGED
@@ -3,6 +3,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import torch
 from huggingface_hub import login
 from .config import Config
+import os
 
 logger = logging.getLogger(__name__)
 
@@ -13,11 +14,15 @@ class ModelManager:
         self.model = None
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
+        # Ensure offline mode is disabled
+        os.environ['HF_HUB_OFFLINE'] = '0'
+        os.environ['TRANSFORMERS_OFFLINE'] = '0'
+
         # Login to Hugging Face Hub
         if Config.HUGGING_FACE_TOKEN:
             logger.info("Logging in to Hugging Face Hub")
             try:
-                login(token=Config.HUGGING_FACE_TOKEN)
+                login(token=Config.HUGGING_FACE_TOKEN, add_to_git_credential=False)
                 logger.info("Successfully logged in to Hugging Face Hub")
             except Exception as e:
                 logger.error(f"Failed to login to Hugging Face Hub: {str(e)}")
@@ -34,7 +39,8 @@
             self.tokenizer = AutoTokenizer.from_pretrained(
                 self.model_name,
                 token=Config.HUGGING_FACE_TOKEN,
-                model_max_length=1024  # Limit max length to save memory
+                model_max_length=1024,  # Limit max length to save memory
+                trust_remote_code=True
             )
             # Ensure we have the necessary special tokens
             special_tokens = {
@@ -71,14 +77,8 @@
                 token=Config.HUGGING_FACE_TOKEN,
                 low_cpu_mem_usage=True,
                 torch_dtype=torch.float16,  # Use fp16 for additional memory savings
-
-                offload_folder="offload",  # Enable CPU offloading
-                use_cache=False  # Disable KV cache to save memory
+                trust_remote_code=True
             )
-
-            # Enable gradient checkpointing
-            self.model.gradient_checkpointing_enable()
-
             # Resize embeddings to match tokenizer
             self.model.resize_token_embeddings(len(self.tokenizer))
             logger.info("Model loaded successfully")
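Taken together, the new load path forces online mode, logs in, then loads the tokenizer and model with trust_remote_code=True. Dropping gradient_checkpointing_enable() and use_cache=False is reasonable for an inference-only Space: gradient checkpointing only saves memory during training, and disabling the KV cache slows autoregressive generation; offload_folder also has no effect without an accelerate device_map. A minimal, self-contained sketch of the resulting flow, assuming a hypothetical MODEL_NAME and an HF_TOKEN environment variable in place of the Space's Config object:

import os
import logging

# Force online mode before the HF libraries are imported.
os.environ["HF_HUB_OFFLINE"] = "0"
os.environ["TRANSFORMERS_OFFLINE"] = "0"

import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MODEL_NAME = "gpt2"                    # hypothetical; the Space uses Config.MODEL_NAME
HF_TOKEN = os.environ.get("HF_TOKEN")  # hypothetical stand-in for Config.HUGGING_FACE_TOKEN

if HF_TOKEN:
    # add_to_git_credential=False avoids touching the git credential store.
    login(token=HF_TOKEN, add_to_git_credential=False)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=HF_TOKEN,
    model_max_length=1024,  # cap context length to save memory
    trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    token=HF_TOKEN,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,  # fp16 halves weight memory
    trust_remote_code=True,
)

# Keep the embedding matrix in sync if special tokens were added.
model.resize_token_embeddings(len(tokenizer))
logger.info("Model loaded successfully")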