Spaces: nananie143/q25COTapi

Sleeping

App Files Community

nananie143 committed on Jan 12

Commit

05661ec

verified ·

1 Parent(s): 627e1c6

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -40

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import gradio as gr
-from langchain_community.llms import LlamaCpp  # Updated import
 import os
 import json
 import torch
@@ -15,6 +15,7 @@ import requests
 from pathlib import Path
 from tqdm import tqdm
 from contextlib import asynccontextmanager
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -27,33 +28,30 @@ class ChatCompletionRequest(BaseModel):
     max_tokens: Optional[int] = 2048
     stream: Optional[bool] = False
-def download_model(model_url: str, local_path: Path) -> Path:
-    """Download the model file if it doesn't exist locally."""
-    if local_path.exists():
-        logger.info(f"Model already exists at {local_path}")
-        return local_path
-    logger.info(f"Downloading model from {model_url}")
-    local_path.parent.mkdir(parents=True, exist_ok=True)
-    response = requests.get(model_url, stream=True)
-    total_size = int(response.headers.get('content-length', 0))
-    with open(local_path, 'wb') as file, tqdm(
-        desc=local_path.name,
-        total=total_size,
-        unit='iB',
-        unit_scale=True,
-        unit_divisor=1024,
-    ) as pbar:
-        for data in response.iter_content(chunk_size=1024):
-            size = file.write(data)
-            pbar.update(size)
-    return local_path
 class QwenModel:
-    def __init__(self, model_path: str):
         """Initialize the Qwen model with automatic device detection."""
         try:
             # Check for GPU availability
@@ -61,12 +59,9 @@ class QwenModel:
             self.device_count = torch.cuda.device_count() if self.has_gpu else 0
             logger.info(f"GPU available: {self.has_gpu}, Device count: {self.device_count}")
-            # Ensure model path exists
-            model_path = Path(model_path)
-            if not model_path.exists():
-                # If model doesn't exist locally, download it
-                model_url = "https://huggingface.co/G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF/resolve/main/model.gguf"
-                model_path = download_model(model_url, model_path)
             # Configure model parameters based on available hardware
             n_gpu_layers = 40 if self.has_gpu else 0
@@ -76,13 +71,13 @@ class QwenModel:
                 model_path=str(model_path),
                 n_gpu_layers=n_gpu_layers,
                 n_ctx=4096,
-                n_batch=512 if self.has_gpu else 128,  # Reduced batch size for CPU
                 verbose=True,
                 temperature=0.7,
                 max_tokens=2048,
                 top_p=0.95,
                 top_k=50,
-                f16_kv=self.has_gpu,  # Only use f16 when GPU is available
                 use_mlock=True,
                 use_mmap=True,
             )
@@ -107,8 +102,7 @@ async def lifespan(app: FastAPI):
     """Lifespan context manager for FastAPI startup and shutdown events."""
     global model
     try:
-        model_path = Path("models/qwen-2.5-14b-gguf")
-        model = QwenModel(model_path)
         logger.info("Model initialized successfully")
         yield
     finally:
@@ -117,18 +111,16 @@ async def lifespan(app: FastAPI):
 app = FastAPI(lifespan=lifespan)
-# ... [rest of the FastAPI routes and main function remain the same] ...
 def main():
     """Main function to initialize and launch the application."""
     try:
         global model
-        # Model path
-        model_path = Path("models/qwen-2.5-14b-gguf")
         # Initialize the model if not already initialized
         if model is None:
-            model = QwenModel(model_path)
         # Create and launch the Gradio interface
         interface = create_gradio_interface(model)

 import gradio as gr
+from langchain_community.llms import LlamaCpp
 import os
 import json
 import torch
 from pathlib import Path
 from tqdm import tqdm
 from contextlib import asynccontextmanager
+from huggingface_hub import hf_hub_download
 # Configure logging
 logging.basicConfig(level=logging.INFO)
     max_tokens: Optional[int] = 2048
     stream: Optional[bool] = False
+def download_model_from_hf():
+    """Download the model file from Hugging Face."""
+    try:
+        logger.info("Downloading model from Hugging Face Hub...")
+        # Create models directory if it doesn't exist
+        model_dir = Path("models")
+        model_dir.mkdir(exist_ok=True)
+        # Download the model using huggingface_hub
+        local_path = hf_hub_download(
+            repo_id="G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF",
+            filename="model.gguf",
+            local_dir=model_dir,
+            local_dir_use_symlinks=False
+        )
+        return Path(local_path)
+    except Exception as e:
+        logger.error(f"Error downloading model: {str(e)}")
+        raise
 class QwenModel:
+    def __init__(self):
         """Initialize the Qwen model with automatic device detection."""
         try:
             # Check for GPU availability
             self.device_count = torch.cuda.device_count() if self.has_gpu else 0
             logger.info(f"GPU available: {self.has_gpu}, Device count: {self.device_count}")
+            # Download or get the model
+            model_path = download_model_from_hf()
+            logger.info(f"Model path: {model_path}")
             # Configure model parameters based on available hardware
             n_gpu_layers = 40 if self.has_gpu else 0
                 model_path=str(model_path),
                 n_gpu_layers=n_gpu_layers,
                 n_ctx=4096,
+                n_batch=512 if self.has_gpu else 128,
                 verbose=True,
                 temperature=0.7,
                 max_tokens=2048,
                 top_p=0.95,
                 top_k=50,
+                f16_kv=self.has_gpu,
                 use_mlock=True,
                 use_mmap=True,
             )
     """Lifespan context manager for FastAPI startup and shutdown events."""
     global model
     try:
+        model = QwenModel()
         logger.info("Model initialized successfully")
         yield
     finally:
 app = FastAPI(lifespan=lifespan)
+# ... [rest of the FastAPI routes remain the same] ...
 def main():
     """Main function to initialize and launch the application."""
     try:
         global model
         # Initialize the model if not already initialized
         if model is None:
+            model = QwenModel()
         # Create and launch the Gradio interface
         interface = create_gradio_interface(model)