Update app.py
app.py CHANGED
@@ -11,11 +11,8 @@ from pydantic import BaseModel
 import uvicorn
 import time
 from threading import Lock
-import requests
 from pathlib import Path
-from
-from contextlib import asynccontextmanager
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download, list_repo_files
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -28,6 +25,21 @@ class ChatCompletionRequest(BaseModel):
     max_tokens: Optional[int] = 2048
     stream: Optional[bool] = False
 
+def get_model_filename():
+    """Get the correct model filename from the repository."""
+    try:
+        logger.info("Listing repository files...")
+        files = list_repo_files("G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF")
+        # Filter for GGUF files
+        gguf_files = [f for f in files if f.endswith('.gguf')]
+        if not gguf_files:
+            raise ValueError("No GGUF model files found in repository")
+        logger.info(f"Found model files: {gguf_files}")
+        return gguf_files[0]
+    except Exception as e:
+        logger.error(f"Error listing repository files: {str(e)}")
+        raise
+
 def download_model_from_hf():
     """Download the model file from Hugging Face."""
     try:
@@ -37,10 +49,14 @@ def download_model_from_hf():
         model_dir = Path("models")
         model_dir.mkdir(exist_ok=True)
 
+        # Get the correct filename
+        model_filename = get_model_filename()
+        logger.info(f"Using model file: {model_filename}")
+
         # Download the model using huggingface_hub
         local_path = hf_hub_download(
             repo_id="G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF",
-            filename=
+            filename=model_filename,
             local_dir=model_dir,
             local_dir_use_symlinks=False
         )
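Taken together, the two hunks above replace the incomplete hard-coded filename with a dynamic lookup: list_repo_files enumerates the repository, the first *.gguf entry is chosen, and hf_hub_download pulls it into models/. The sketch below reproduces that flow as a standalone script; it is illustrative only (the repo id is copied from the diff, while the fetch_gguf name and the sorted() pick are not part of the commit).

from pathlib import Path
from huggingface_hub import hf_hub_download, list_repo_files

REPO_ID = "G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF"

def fetch_gguf(repo_id: str = REPO_ID, target_dir: str = "models") -> Path:
    """Resolve the first GGUF file in the repo and download it locally."""
    # Enumerate the repository and keep only GGUF weight files.
    gguf_files = sorted(f for f in list_repo_files(repo_id) if f.endswith(".gguf"))
    if not gguf_files:
        raise ValueError(f"No GGUF model files found in {repo_id}")
    # sorted() makes the choice deterministic if several quantizations exist;
    # the committed code simply takes the first entry the API returns.
    local_path = hf_hub_download(
        repo_id=repo_id,
        filename=gguf_files[0],
        local_dir=target_dir,
    )
    return Path(local_path)

if __name__ == "__main__":
    print(fetch_gguf())

The committed call also keeps local_dir_use_symlinks=False, so in huggingface_hub versions that still honour that argument the GGUF lands as a regular file under models/ rather than as a symlink into the shared cache.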
@@ -67,11 +83,15 @@ class QwenModel:
         n_gpu_layers = 40 if self.has_gpu else 0
         logger.info(f"Using {'GPU' if self.has_gpu else 'CPU'} for inference")
 
+        # Adjust memory settings for CPU
+        n_batch = 512 if self.has_gpu else 64  # Reduced batch size for CPU
+        n_ctx = 2048 if not self.has_gpu else 4096  # Reduced context for CPU
+
         self.llm = LlamaCpp(
             model_path=str(model_path),
             n_gpu_layers=n_gpu_layers,
-            n_ctx=
-            n_batch=
+            n_ctx=n_ctx,
+            n_batch=n_batch,
             verbose=True,
             temperature=0.7,
             max_tokens=2048,
@@ -80,6 +100,9 @@ class QwenModel:
             f16_kv=self.has_gpu,
             use_mlock=True,
             use_mmap=True,
+            seed=42,  # For reproducibility
+            repeat_penalty=1.1,  # Prevent repetitive outputs
+            rope_scaling={"type": "linear", "factor": 1.0},  # RoPE scaling for better long-context handling
         )
 
         # Thread lock for concurrent API requests
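The constructor changes size the batch and context window from the GPU probe: 512/4096 when layers can be offloaded, 64/2048 on CPU so the KV cache and prompt-processing buffers stay within ordinary RAM. The diff does not show which LlamaCpp wrapper is imported, so the sketch below goes straight to llama-cpp-python to illustrate the same sizing; the GPU probe and model path are placeholders, and generation-time settings such as temperature and repeat_penalty appear at call time, which is where that library expects them.

from llama_cpp import Llama

has_gpu = False                      # placeholder: substitute a real CUDA/Metal probe
n_gpu_layers = 40 if has_gpu else 0
n_batch = 512 if has_gpu else 64     # smaller prompt batches on CPU
n_ctx = 4096 if has_gpu else 2048    # smaller context window / KV cache on CPU

llm = Llama(
    model_path="models/placeholder-q8_0.gguf",  # placeholder path
    n_gpu_layers=n_gpu_layers,
    n_ctx=n_ctx,
    n_batch=n_batch,
    use_mlock=True,
    use_mmap=True,
    seed=42,          # fixed seed for reproducible sampling
    verbose=True,
)

out = llm(
    "Hello",
    max_tokens=64,
    temperature=0.7,
    repeat_penalty=1.1,   # discourage repetitive output
)
print(out["choices"][0]["text"])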