Update handler.py
handler.py  +3 -3
@@ -7,7 +7,7 @@ os.environ['FORCE_CMAKE'] = "1"
 import sys
 import subprocess
 subprocess.check_call([sys.executable, '-m', 'pip', 'install',
-                       'llama-cpp-python
+                       'llama-cpp-python'])
 
 from llama_cpp import Llama
 
@@ -22,13 +22,13 @@ cpu_count: int = int(_) if _ else 1
 
 
 
-MAX_INPUT_TOKEN_LENGTH =
+MAX_INPUT_TOKEN_LENGTH = 3072
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 
 class EndpointHandler():
     def __init__(self, path=""):
-        self.llm = Llama(model_path="/repository/iubaris-13b-
+        self.llm = Llama(model_path="/repository/iubaris-13b-v3_ggml_Q4_K_M.gguf", n_ctx=MAX_INPUT_TOKEN_LENGTH, n_gpu_layers=50, n_threads=cpu_count, verbose=True)
 
     def get_input_token_length(self, message: str) -> int:
         input_ids = self.model([message.encode('utf-8')])
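One thing this commit does not touch: the unchanged context line at the bottom of the second hunk calls self.model(...), but __init__ only ever assigns self.llm, and Llama.__call__ expects a prompt string rather than a list of bytes, so that line would raise at runtime as written. Below is a minimal sketch of how the helper could be written against the self.llm instance instead, assuming llama-cpp-python's Llama.tokenize API (which takes UTF-8 bytes and returns a list of token ids). The cpu_count here is a stand-in using os.cpu_count(); the real file derives it from an environment value, as the hunk header shows. This is not part of the commit, just one plausible fix.

import os

from llama_cpp import Llama

MAX_INPUT_TOKEN_LENGTH = 3072

# Stand-in for the file's cpu_count, which the hunk header shows being
# parsed from an environment value: cpu_count: int = int(_) if _ else 1
cpu_count = os.cpu_count() or 1

class EndpointHandler():
    def __init__(self, path=""):
        # Same constructor arguments the commit introduces.
        self.llm = Llama(
            model_path="/repository/iubaris-13b-v3_ggml_Q4_K_M.gguf",
            n_ctx=MAX_INPUT_TOKEN_LENGTH,
            n_gpu_layers=50,
            n_threads=cpu_count,
            verbose=True,
        )

    def get_input_token_length(self, message: str) -> int:
        # Llama.tokenize takes UTF-8 bytes and returns a list of token ids,
        # so the prompt's token count is the length of that list.
        return len(self.llm.tokenize(message.encode('utf-8')))

Counting tokens through the same Llama instance keeps get_input_token_length consistent with the n_ctx=MAX_INPUT_TOKEN_LENGTH limit passed to the constructor, so over-long inputs can be rejected or truncated before they overflow the context window.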