import torch
import streamlit as st
from transformers import AutoTokenizer, T5ForConditionalGeneration, GenerationConfig

st.title('How does the LLM choose its words?')

# Default widget state so the selectbox works on a fresh session.
if "visibility" not in st.session_state:
    st.session_state.visibility = "visible"
if "disabled" not in st.session_state:
    st.session_state.disabled = False

model_checkpoint = st.selectbox(
    "Model:",
    ("google/flan-t5-base", "google/flan-t5-xl"),
    label_visibility=st.session_state.visibility,
    disabled=st.session_state.disabled,
)

# Sampling temperature; min/max/value must all be floats for st.number_input.
temperature = st.number_input('Temperature: ', min_value=0.0, max_value=1.0, value=0.5, format='%f')

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(
    model_checkpoint, load_in_8bit=False, device_map="auto"
)

instruction = st.text_area('Write an instruction:')

max_tokens = st.number_input('Max output length: ', min_value=1, max_value=64, format='%i')

# Alpaca-style prompt template wrapping the user's instruction.
prompts = [
    f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction: {instruction}

### Response:"""
]

inputs = tokenizer(
    prompts[0],
    return_tensors="pt",
)
input_ids = inputs["input_ids"]  # .to("cuda") if loading the model on a single GPU without device_map

generation_config = GenerationConfig(
    do_sample=True,
    temperature=temperature,
    top_p=0.995,  # default 0.75
    top_k=100,  # default 80
    repetition_penalty=1.5,
    max_new_tokens=max_tokens,
)

if instruction:
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
        )
    output_text = tokenizer.decode(
        outputs['sequences'][0],
        skip_special_tokens=False
    ).strip()
    st.write(output_text)
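
# Optional sketch: because `output_scores=True` is requested above, the per-step
# distributions in `outputs["scores"]` can also be shown as the probability of each
# sampled token, which is the question the app title asks. This assumes a recent
# transformers release that provides `compute_transition_scores`; it is an
# illustrative addition, not part of the original app logic.
if instruction:
    transition_scores = model.compute_transition_scores(
        outputs["sequences"], outputs["scores"], normalize_logits=True
    )
    # sequences[0] begins with the decoder start token, so skip it when pairing
    # generated tokens with their log-probabilities.
    for tok_id, logprob in zip(outputs["sequences"][0][1:], transition_scores[0]):
        st.write(f"`{tokenizer.decode(int(tok_id))}`: p = {logprob.exp().item():.3f}")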