# local-llm-2/utils/epfl_meditron_utils.py
from ctransformers import AutoModelForCausalLM
# Simple inference example:
# output = llm(
#     "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",  # Prompt
#     max_tokens=512,  # Generate up to 512 tokens
#     stop=["</s>"],  # Example stop token - not necessarily correct for this specific model! Please check before using.
#     echo=True,  # Whether to echo the prompt
# )
prompt_format = "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant"
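

def build_prompt(system_message, user_prompt):
    # Illustrative helper, not part of the original file: fills the ChatML-style
    # prompt_format above so the result can be passed to get_llm_response below.
    return prompt_format.format(system_message=system_message, prompt=user_prompt)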


def get_llm_response(repo, filename, model_type, gpu_layers, prompt):
    """Load a quantised model from `repo` with ctransformers and run a single completion."""
    print("Loading model")
    model = AutoModelForCausalLM.from_pretrained(
        repo,
        model_file=filename,
        model_type=model_type,
        gpu_layers=gpu_layers,
    )
    print("Model loaded")
    # llm_prompt = prompt_format.format(system_message=system_prompt, prompt=prompt)
    print(f"LLM prompt: {prompt}")
    response = model(prompt, stop=["</s>"])  # "</s>" as stop token; verify it matches the model's chat format
    print(f"Response: {response}")
    return response
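

if __name__ == "__main__":
    # Illustrative usage sketch, not from the original file: the repo, file
    # name, and model_type below are assumptions for a GGUF build of Meditron
    # (a Llama-based model) and may need to be adjusted before running.
    example_prompt = build_prompt(
        "You are a helpful medical assistant.",
        "What are common symptoms of anemia?",
    )
    get_llm_response(
        repo="TheBloke/meditron-7B-GGUF",    # assumed quantised repo
        filename="meditron-7b.Q4_K_M.gguf",  # assumed model file name
        model_type="llama",
        gpu_layers=0,                        # CPU-only; raise to offload layers to a GPU
        prompt=example_prompt,
    )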