# local-llm-2/utils/epfl_meditron_utils.py
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
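# NOTE: loading GPTQ-quantized checkpoints through transformers typically also
# requires the optimum and auto-gptq packages to be installed (exact package
# requirements depend on the transformers version in use).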


def gptq_model_options():
    """Return the supported GPTQ-quantized model repositories."""
    return [
        "TheBloke/Llama-2-7B-Chat-GPTQ",
        "TheBloke/Llama-2-13B-chat-GPTQ",
        "TheBloke/meditron-7B-GPTQ",
        "TheBloke/meditron-70B-GPTQ",
    ]


def get_llm_response(model_name_or_path, temperature, do_sample, top_p, top_k,
                     max_new_tokens, repetition_penalty, formatted_prompt):
    """Load a causal LM and generate a completion for the given formatted prompt."""
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        device_map="auto",
        trust_remote_code=False,
        revision="main",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

    print("Formatted prompt:")
    print(formatted_prompt)

    print("\n\n*** Generate:")
    # Move the prompt tokens to the device the model was placed on by device_map="auto".
    input_ids = tokenizer(formatted_prompt, return_tensors="pt").input_ids.to(model.device)
    output = model.generate(
        inputs=input_ids,
        temperature=temperature,
        do_sample=do_sample,
        top_p=top_p,
        top_k=top_k,
        max_new_tokens=max_new_tokens,
    )
    print(tokenizer.decode(output[0], skip_special_tokens=True))

    print("*** Pipeline:")
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
    )
    response = pipe(formatted_prompt)[0]["generated_text"]
    print(response)
    return response
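

if __name__ == "__main__":
    # Illustrative usage sketch, not part of the original module: the prompt
    # string and the sampling parameters below are assumed example values, and
    # the chat/instruction template expected by each checkpoint may differ.
    example_model = gptq_model_options()[2]  # "TheBloke/meditron-7B-GPTQ"
    example_prompt = "What are common symptoms of iron deficiency?"
    get_llm_response(
        model_name_or_path=example_model,
        temperature=0.7,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        max_new_tokens=256,
        repetition_penalty=1.1,
        formatted_prompt=example_prompt,
    )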