nananie143 committed
Commit d582d65 · verified · 1 Parent(s): 5becbcb

Update app.py

Files changed (1):
  1. app.py +30 -7
app.py CHANGED
@@ -11,11 +11,8 @@ from pydantic import BaseModel
 import uvicorn
 import time
 from threading import Lock
-import requests
 from pathlib import Path
-from tqdm import tqdm
-from contextlib import asynccontextmanager
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download, list_repo_files

 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -28,6 +25,21 @@ class ChatCompletionRequest(BaseModel):
     max_tokens: Optional[int] = 2048
     stream: Optional[bool] = False

+def get_model_filename():
+    """Get the correct model filename from the repository."""
+    try:
+        logger.info("Listing repository files...")
+        files = list_repo_files("G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF")
+        # Filter for GGUF files
+        gguf_files = [f for f in files if f.endswith('.gguf')]
+        if not gguf_files:
+            raise ValueError("No GGUF model files found in repository")
+        logger.info(f"Found model files: {gguf_files}")
+        return gguf_files[0]
+    except Exception as e:
+        logger.error(f"Error listing repository files: {str(e)}")
+        raise
+
 def download_model_from_hf():
     """Download the model file from Hugging Face."""
     try:
@@ -37,10 +49,14 @@ def download_model_from_hf():
         model_dir = Path("models")
         model_dir.mkdir(exist_ok=True)

+        # Get the correct filename
+        model_filename = get_model_filename()
+        logger.info(f"Using model file: {model_filename}")
+
         # Download the model using huggingface_hub
         local_path = hf_hub_download(
             repo_id="G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF",
-            filename="model.gguf",
+            filename=model_filename,
             local_dir=model_dir,
             local_dir_use_symlinks=False
         )
@@ -67,11 +83,15 @@ class QwenModel:
         n_gpu_layers = 40 if self.has_gpu else 0
         logger.info(f"Using {'GPU' if self.has_gpu else 'CPU'} for inference")

+        # Adjust memory settings for CPU
+        n_batch = 512 if self.has_gpu else 64  # Reduced batch size for CPU
+        n_ctx = 2048 if not self.has_gpu else 4096  # Reduced context for CPU
+
         self.llm = LlamaCpp(
             model_path=str(model_path),
             n_gpu_layers=n_gpu_layers,
-            n_ctx=4096,
-            n_batch=512 if self.has_gpu else 128,
+            n_ctx=n_ctx,
+            n_batch=n_batch,
             verbose=True,
             temperature=0.7,
             max_tokens=2048,
@@ -80,6 +100,9 @@ class QwenModel:
             f16_kv=self.has_gpu,
             use_mlock=True,
             use_mmap=True,
+            seed=42,  # For reproducibility
+            repeat_penalty=1.1,  # Prevent repetitive outputs
+            rope_scaling={"type": "linear", "factor": 1.0},  # RoPE scaling for better long-context handling
         )

         # Thread lock for concurrent API requests
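For reference, a minimal standalone sketch of the file-discovery step this commit introduces (assumes the huggingface_hub package is installed; the pick_gguf_file helper name and the commented-out download call are illustrative and not part of app.py):

from huggingface_hub import hf_hub_download, list_repo_files

REPO_ID = "G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF"

def pick_gguf_file(repo_id: str) -> str:
    # Mirrors get_model_filename(): take the first *.gguf file listed in the repo
    gguf_files = [f for f in list_repo_files(repo_id) if f.endswith(".gguf")]
    if not gguf_files:
        raise ValueError("No GGUF model files found in repository")
    return gguf_files[0]

if __name__ == "__main__":
    filename = pick_gguf_file(REPO_ID)
    print(f"Would download: {filename}")
    # Uncomment to fetch the weights into ./models, as app.py does:
    # hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir="models")

Run before starting the server to confirm which quantized file the app would resolve, since the repository no longer needs to contain a file literally named model.gguf.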