# llm-token-probs / app.py
import torch
import streamlit as st
from transformers import AutoTokenizer, T5ForConditionalGeneration, GenerationConfig
st.title('How do LLMs choose their words?')
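# Side-by-side controls: model checkpoint and sampling temperature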
col1, col2 = st.columns(2)
with col1:
    model_checkpoint = st.selectbox(
        "Model:",
        ("google/flan-t5-base", "google/flan-t5-xl")
    )
with col2:
    temperature = st.number_input('Temperature: ', min_value=0.0, max_value=1.0, value=0.5, format='%f')
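# Load the tokenizer and the seq2seq model; device_map="auto" lets accelerate place the weights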
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(
    model_checkpoint,
    load_in_8bit=False,
    device_map="auto"
)
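# Free-text instruction and a cap on the number of generated tokens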
instruction = st.text_area('Write an instruction:')
max_tokens = st.number_input('Max output length: ', min_value=1, max_value=64, format='%i')
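# Wrap the instruction in an instruction/response prompt template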
prompts = [
f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction: {instruction}
### Response:"""
]
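# Tokenize the prompt into model inputs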
inputs = tokenizer(
    prompts[0],
    return_tensors="pt",
)
input_ids = inputs["input_ids"]#.to("cuda")
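# Sampling settings: UI-controlled temperature plus top-p / top-k sampling and a repetition penalty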
generation_config = GenerationConfig(
    do_sample=True,
    temperature=temperature,
    top_p=0.995,  # default 0.75
    top_k=100,  # default 80
    repetition_penalty=1.5,
    max_new_tokens=max_tokens,
)
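# Generate only once an instruction has been entered; keep per-step scores so
# token-level transition scores can be shown next to the decoded output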
if instruction:
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True
        )
    output_text = tokenizer.decode(
        outputs['sequences'][0],  # .cuda(),
        skip_special_tokens=False
    ).strip()
    st.write(output_text)
    st.write(model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=False))