import torch
import streamlit as st
from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    GenerationConfig,
)

st.title('How does the LLM choose its words?')

# Load the instruction-tuned FLAN-T5 model and its tokenizer.
model_checkpoint = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(
    model_checkpoint,
    load_in_8bit=False,
    device_map="auto",
)

# Collect the user's instruction and the maximum number of tokens to generate.
instruction = st.text_area('Write an instruction:')
max_tokens = st.number_input('Max output length:', min_value=1, max_value=64, format='%i')

# Wrap the instruction in an Alpaca-style prompt template.
prompts = [
    f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""
]

inputs = tokenizer(
    prompts[0],
    return_tensors="pt",
)
input_ids = inputs["input_ids"]  # .to("cuda")

# Sampling settings: a higher temperature with wide top-p / top-k keeps the output varied,
# while the repetition penalty discourages the model from looping on the same tokens.
generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.8,       # default 0.1
    top_p=0.995,           # default 0.75
    top_k=100,             # default 80
    repetition_penalty=1.5,
    max_new_tokens=max_tokens,
)

if instruction:
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
        )
    output_text = tokenizer.decode(
        outputs['sequences'][0],  # .cuda()
        skip_special_tokens=False,
    ).strip()
    st.write(output_text)
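
# A minimal sketch of one way to surface the collected scores in the app: with
# return_dict_in_generate=True and output_scores=True, outputs['scores'] is a tuple with
# one tensor of processed logits (after temperature, top-p, top-k and repetition penalty)
# per generated token, so softmaxing each tensor shows the distribution that token was
# actually sampled from. Showing the top-5 candidates per step is an illustrative choice.
if instruction:
    sequence = outputs['sequences'][0]
    for step, step_scores in enumerate(outputs['scores']):
        # step_scores has shape (batch_size, vocab_size); take the single batch row.
        probs = torch.softmax(step_scores[0], dim=-1)
        top_probs, top_ids = probs.topk(5)
        # sequence[0] is the decoder start token, so the token chosen at this step is at step + 1.
        chosen = tokenizer.decode([int(sequence[step + 1])])
        candidates = ", ".join(
            f"{tokenizer.decode([tid])!r}: {p:.2f}"
            for tid, p in zip(top_ids.tolist(), top_probs.tolist())
        )
        st.write(f"Step {step}: sampled {chosen!r} from {{{candidates}}}")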