Update handler.py
handler.py  +3 -3
@@ -7,7 +7,7 @@ os.environ['FORCE_CMAKE'] = "1"
 import sys
 import subprocess
 subprocess.check_call([sys.executable, '-m', 'pip', 'install',
-                       'llama-cpp-python
+                       'llama-cpp-python'])
 
 from llama_cpp import Llama
 
@@ -22,13 +22,13 @@ cpu_count: int = int(_) if _ else 1
 
 
 
-MAX_INPUT_TOKEN_LENGTH =
+MAX_INPUT_TOKEN_LENGTH = 3072
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 
 class EndpointHandler():
     def __init__(self, path=""):
-        self.llm = Llama(model_path="/repository/iubaris-13b-
+        self.llm = Llama(model_path="/repository/iubaris-13b-v3_ggml_Q4_K_M.gguf", n_ctx=MAX_INPUT_TOKEN_LENGTH, n_gpu_layers=50, n_threads=cpu_count, verbose=True)
 
     def get_input_token_length(self, message: str) -> int:
         input_ids = self.model([message.encode('utf-8')])
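One thing this commit does not touch: the unchanged context line at the bottom of the second hunk calls self.model(...), but __init__ only ever assigns self.llm, and Llama.__call__ expects a prompt string rather than a list of bytes, so that line would raise at runtime as written. Below is a minimal sketch of how the helper could be written against the self.llm instance instead, assuming llama-cpp-python's Llama.tokenize API (which takes UTF-8 bytes and returns a list of token ids). The cpu_count here is a stand-in using os.cpu_count(); the real file derives it from an environment value, as the hunk header shows. This is not part of the commit, just one plausible fix.

import os

from llama_cpp import Llama

MAX_INPUT_TOKEN_LENGTH = 3072

# Stand-in for the file's cpu_count, which the hunk header shows being
# parsed from an environment value: cpu_count: int = int(_) if _ else 1
cpu_count = os.cpu_count() or 1

class EndpointHandler():
    def __init__(self, path=""):
        # Same constructor arguments the commit introduces.
        self.llm = Llama(
            model_path="/repository/iubaris-13b-v3_ggml_Q4_K_M.gguf",
            n_ctx=MAX_INPUT_TOKEN_LENGTH,
            n_gpu_layers=50,
            n_threads=cpu_count,
            verbose=True,
        )

    def get_input_token_length(self, message: str) -> int:
        # Llama.tokenize takes UTF-8 bytes and returns a list of token ids,
        # so the prompt's token count is the length of that list.
        return len(self.llm.tokenize(message.encode('utf-8')))

Counting tokens through the same Llama instance keeps get_input_token_length consistent with the n_ctx=MAX_INPUT_TOKEN_LENGTH limit passed to the constructor, so over-long inputs can be rejected or truncated before they overflow the context window.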