from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import streamlit as st


def gptq_model_options():
    # GPTQ-quantized chat models offered for selection in the UI.
    return [
        "TheBloke/Llama-2-7B-Chat-GPTQ",
        "TheBloke/Llama-2-13B-chat-GPTQ",
        "TheBloke/meditron-7B-GPTQ",
        "TheBloke/meditron-70B-GPTQ",
    ]


def get_llm_response(model_name_or_path, temperature, do_sample, top_p, top_k,
                     max_new_tokens, repetition_penalty, formatted_prompt):
    # Load the quantized model and its tokenizer. Note: this reloads the model on
    # every call; caching the loader (e.g. with st.cache_resource) would avoid that.
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                 device_map="auto",
                                                 trust_remote_code=False,
                                                 revision="main")
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

    print("Formatted prompt:")
    print(formatted_prompt)

    # Keep a log of prompts and responses in the Streamlit session.
    if "llm_messages" not in st.session_state:
        st.session_state["llm_messages"] = []
    st.session_state["llm_messages"].append(formatted_prompt)

    # Alternative: generate directly without a pipeline.
    # input_ids = tokenizer(formatted_prompt, return_tensors='pt').input_ids.cuda()
    # output = model.generate(inputs=input_ids, temperature=temperature, do_sample=do_sample,
    #                         top_p=top_p, top_k=top_k, max_new_tokens=max_new_tokens)
    # print(tokenizer.decode(output[0], skip_special_tokens=True))

    print("*** Pipeline:")
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
    )
    pipe_response = pipe(formatted_prompt)

    st.session_state["llm_messages"].append(pipe_response)
    print(pipe_response)

    # The pipeline returns a list of dicts; the text (prompt plus completion)
    # is under 'generated_text'.
    return pipe_response[0]['generated_text']
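

# Usage sketch (illustrative, not part of the original Space): a minimal Streamlit
# front end that drives get_llm_response(). The widget labels, sampling defaults,
# and the simple [INST] prompt wrapping below are assumptions for demonstration;
# the real app may format prompts and lay out the page differently.
def demo_chat_ui():
    model_name = st.selectbox("Model", gptq_model_options())
    user_input = st.text_area("Your message")
    if st.button("Generate") and user_input:
        # Llama-2-chat style instruction wrapping; adjust to the selected model's template.
        formatted_prompt = f"[INST] {user_input} [/INST]"
        response = get_llm_response(
            model_name,
            temperature=0.7,
            do_sample=True,
            top_p=0.95,
            top_k=40,
            max_new_tokens=512,
            repetition_penalty=1.1,
            formatted_prompt=formatted_prompt,
        )
        st.write(response)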