# llm-token-probs / app.py
import torch
import streamlit as st
from transformers import AutoTokenizer, T5ForConditionalGeneration, GenerationConfig

st.title('How do LLMs choose their words?')

col1, col2 = st.columns(2)
with col1:
    model_checkpoint = st.selectbox(
        "Model:",
        ("google/flan-t5-base", "google/flan-t5-xl")
    )
with col2:
    temperature = st.number_input('Temperature: ', min_value=0.0, max_value=1.0, value=0.5, format='%f')
    max_tokens = st.number_input('Max output length: ', min_value=1, max_value=64, format='%i')

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(
    model_checkpoint,
    load_in_8bit=False,
    device_map="auto"
)

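# Optional refinement (not in the original file): cache the heavy model load so
# Streamlit does not reload the checkpoint on every widget interaction. This is
# a hedged sketch assuming a Streamlit version that provides st.cache_resource;
# `load_model` is a helper name introduced here for illustration only.
#
# @st.cache_resource
# def load_model(checkpoint):
#     tok = AutoTokenizer.from_pretrained(checkpoint)
#     mdl = T5ForConditionalGeneration.from_pretrained(checkpoint, device_map="auto")
#     return tok, mdl
#
# tokenizer, model = load_model(model_checkpoint)
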
instruction = st.text_area('Write an instruction:')

prompts = [
    f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction: {instruction}
### Response:"""
]

inputs = tokenizer(
    prompts[0],
    return_tensors="pt",
)
input_ids = inputs["input_ids"]  # .to("cuda") when running on GPU

generation_config = GenerationConfig(
    do_sample=True,
    temperature=temperature,
    top_p=0.995,  # default 0.75
    top_k=100,  # default 80
    repetition_penalty=1.5,
    max_new_tokens=max_tokens,
)

if instruction:
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True
        )
    output_text = tokenizer.decode(
        outputs['sequences'][0],
        skip_special_tokens=False
    ).strip()
    st.write(output_text)
    # Per-step scores of the sampled tokens (one entry per generated token)
    transition_scores = model.compute_transition_scores(
        outputs.sequences, outputs.scores, normalize_logits=False
    )
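    # Hedged sketch (not in the original file): convert the generation scores
    # into per-token probabilities and display them, in line with the app's
    # title. Assumes normalize_logits=True so the returned values are
    # log-probabilities; for an encoder-decoder model like T5, sequences[0]
    # starts with the decoder start token, which is skipped so that tokens and
    # scores align one-to-one.
    log_probs = model.compute_transition_scores(
        outputs.sequences, outputs.scores, normalize_logits=True
    )[0]
    generated_tokens = outputs.sequences[0, 1:]
    st.table({
        'token': [tokenizer.decode(tok) for tok in generated_tokens],
        'probability': [f'{p:.3f}' for p in log_probs.exp().tolist()],
    })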