File size: 1,549 Bytes
757dddf
 
 
 
 
 
364ca27
757dddf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364ca27
 
 
757dddf
 
 
364ca27
757dddf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from ctransformers import AutoModelForCausalLM, AutoTokenizer
from loguru import logger
import os


def models():
    """Return the model file names known to this module.

    A new list is built on every call, so callers may mutate the result
    without affecting later calls.
    """
    available = []
    available.append("mistral-7b-openorca.Q5_K_M.gguf")
    return available


def load():
    """Load the GGUF model through ctransformers and build its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is created with
        ``hf=True`` and the tokenizer is derived from that model object.
    """
    # An alternative checkpoint that was tried earlier:
    # "TheBloke/OpenHermes-2.5-Mistral-7B-GGUF" with
    # "openhermes-2.5-mistral-7b.Q4_K_M.gguf" (model_type="mistral", gpu_layers=0, hf=True).

    # Where the weights come from and how they are wrapped.
    source_options = {
        "model_path_or_repo_id": "TheBloke/Mistral-7B-OpenOrca-GGUF",
        "model_file": "mistral-7b-openorca.Q5_K_M.gguf",
        "model_type": "mistral",
        "hf": True,
    }
    # Sampling / generation settings handed to ctransformers.
    sampling_options = {
        "temperature": 0.7,
        "top_p": 0.7,
        "top_k": 50,
        "repetition_penalty": 1.2,
        "context_length": 32768,
        "max_new_tokens": 2048,
    }
    # Runtime settings: one thread per CPU reported by the OS, no GPU layers.
    runtime_options = {
        "threads": os.cpu_count(),
        "stream": True,
        "gpu_layers": 0,
    }

    llm = AutoModelForCausalLM.from_pretrained(
        **source_options,
        **sampling_options,
        **runtime_options,
    )
    tok = AutoTokenizer.from_pretrained(llm)
    return (llm, tok)


# Load the model and tokenizer once, at import time; ask() reads these
# module-level names on every call.
model, tokenizer = load()


def ask(_, system_prompt, pre_prompt, question):
    """Ask the module-level model a question and return the decoded output.

    Args:
        _: Ignored; kept so the signature stays compatible with callers.
        system_prompt: First part of the system message.
        pre_prompt: Second part of the system message; joined to
            ``system_prompt`` with a single space.
        question: Content of the user message.

    Returns:
        The first decoded sequence produced by ``model.generate``, as
        returned by ``tokenizer.batch_decode``.
    """
    messages = [
        {'role': 'system', 'content': f"{system_prompt} {pre_prompt}", },
        {'role': 'user', 'content': f"{question}", },
    ]
    logger.debug(f"<< transformers << {messages}")

    # The model is loaded with hf=True, so generate() is the transformers
    # implementation and needs a tensor of token ids. Rendering the template
    # with tokenize=False yields a plain string, which generate() cannot
    # consume; tokenize here and request PyTorch tensors instead.
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )

    # NOTE(review): max_length counts the prompt tokens as well, so a long
    # prompt leaves little or no room for the reply — confirm whether
    # max_new_tokens was intended.
    outputs = model.generate(input_ids, max_length=200)
    answer = tokenizer.batch_decode(outputs)[0]
    logger.debug(f">> transformers >> {answer}")
    return answer