Update app.py
app.py CHANGED
@@ -11,11 +11,8 @@ from pydantic import BaseModel
 import uvicorn
 import time
 from threading import Lock
-import requests
 from pathlib import Path
-from
-from contextlib import asynccontextmanager
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download, list_repo_files
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -28,6 +25,21 @@ class ChatCompletionRequest(BaseModel):
     max_tokens: Optional[int] = 2048
     stream: Optional[bool] = False
 
+def get_model_filename():
+    """Get the correct model filename from the repository."""
+    try:
+        logger.info("Listing repository files...")
+        files = list_repo_files("G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF")
+        # Filter for GGUF files
+        gguf_files = [f for f in files if f.endswith('.gguf')]
+        if not gguf_files:
+            raise ValueError("No GGUF model files found in repository")
+        logger.info(f"Found model files: {gguf_files}")
+        return gguf_files[0]
+    except Exception as e:
+        logger.error(f"Error listing repository files: {str(e)}")
+        raise
+
 def download_model_from_hf():
     """Download the model file from Hugging Face."""
     try:
@@ -37,10 +49,14 @@ def download_model_from_hf():
         model_dir = Path("models")
         model_dir.mkdir(exist_ok=True)
 
+        # Get the correct filename
+        model_filename = get_model_filename()
+        logger.info(f"Using model file: {model_filename}")
+
         # Download the model using huggingface_hub
         local_path = hf_hub_download(
             repo_id="G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF",
-            filename=
+            filename=model_filename,
             local_dir=model_dir,
             local_dir_use_symlinks=False
         )
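Taken together, the two hunks above replace the incomplete hard-coded filename with a dynamic lookup: list_repo_files enumerates the repository, the first *.gguf entry is chosen, and hf_hub_download pulls it into models/. The sketch below reproduces that flow as a standalone script; it is illustrative only (the repo id is copied from the diff, while the fetch_gguf name and the sorted() pick are not part of the commit).

from pathlib import Path
from huggingface_hub import hf_hub_download, list_repo_files

REPO_ID = "G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF"

def fetch_gguf(repo_id: str = REPO_ID, target_dir: str = "models") -> Path:
    """Resolve the first GGUF file in the repo and download it locally."""
    # Enumerate the repository and keep only GGUF weight files.
    gguf_files = sorted(f for f in list_repo_files(repo_id) if f.endswith(".gguf"))
    if not gguf_files:
        raise ValueError(f"No GGUF model files found in {repo_id}")
    # sorted() makes the choice deterministic if several quantizations exist;
    # the committed code simply takes the first entry the API returns.
    local_path = hf_hub_download(
        repo_id=repo_id,
        filename=gguf_files[0],
        local_dir=target_dir,
    )
    return Path(local_path)

if __name__ == "__main__":
    print(fetch_gguf())

The committed call also keeps local_dir_use_symlinks=False, so in huggingface_hub versions that still honour that argument the GGUF lands as a regular file under models/ rather than as a symlink into the shared cache.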
@@ -67,11 +83,15 @@ class QwenModel:
         n_gpu_layers = 40 if self.has_gpu else 0
         logger.info(f"Using {'GPU' if self.has_gpu else 'CPU'} for inference")
 
+        # Adjust memory settings for CPU
+        n_batch = 512 if self.has_gpu else 64  # Reduced batch size for CPU
+        n_ctx = 2048 if not self.has_gpu else 4096  # Reduced context for CPU
+
         self.llm = LlamaCpp(
             model_path=str(model_path),
             n_gpu_layers=n_gpu_layers,
-            n_ctx=
-            n_batch=
+            n_ctx=n_ctx,
+            n_batch=n_batch,
             verbose=True,
             temperature=0.7,
             max_tokens=2048,
@@ -80,6 +100,9 @@ class QwenModel:
             f16_kv=self.has_gpu,
             use_mlock=True,
             use_mmap=True,
+            seed=42,  # For reproducibility
+            repeat_penalty=1.1,  # Prevent repetitive outputs
+            rope_scaling={"type": "linear", "factor": 1.0},  # RoPE scaling for better long-context handling
         )
 
         # Thread lock for concurrent API requests
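The constructor changes size the batch and context window from the GPU probe: 512/4096 when layers can be offloaded, 64/2048 on CPU so the KV cache and prompt-processing buffers stay within ordinary RAM. The diff does not show which LlamaCpp wrapper is imported, so the sketch below goes straight to llama-cpp-python to illustrate the same sizing; the GPU probe and model path are placeholders, and generation-time settings such as temperature and repeat_penalty appear at call time, which is where that library expects them.

from llama_cpp import Llama

has_gpu = False                      # placeholder: substitute a real CUDA/Metal probe
n_gpu_layers = 40 if has_gpu else 0
n_batch = 512 if has_gpu else 64     # smaller prompt batches on CPU
n_ctx = 4096 if has_gpu else 2048    # smaller context window / KV cache on CPU

llm = Llama(
    model_path="models/placeholder-q8_0.gguf",  # placeholder path
    n_gpu_layers=n_gpu_layers,
    n_ctx=n_ctx,
    n_batch=n_batch,
    use_mlock=True,
    use_mmap=True,
    seed=42,          # fixed seed for reproducible sampling
    verbose=True,
)

out = llm(
    "Hello",
    max_tokens=64,
    temperature=0.7,
    repeat_penalty=1.1,   # discourage repetitive output
)
print(out["choices"][0]["text"])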