import os

import numpy as np
import pandas as pd
import streamlit as st
import torch
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers.utils import is_flash_attn_2_available

# MODEL CONFIG
model_id = 'google/gemma-2b-it'
HF_TOKEN = os.environ['HF_TOKEN']


@st.cache_resource
def load_model(model_id):
    # CHOOSE DEVICE: prefer Apple's MPS backend, fall back to CPU
    print(torch.backends.mps.is_available())
    device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
    print(device)
    if device.type == 'cpu':
        print('Warning! No GPU available')

    # IMPORT MODEL
    print(model_id)
    quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                             bnb_4bit_compute_dtype=torch.float16)

    # if is_flash_attn_2_available() and (torch.cuda.get_device_capability(0)[0] >= 8):
    #     attn_implementation = "flash_attention_2"
    # else:
    #     attn_implementation = "sdpa"
    # print(f"[INFO] Using attention implementation: {attn_implementation}")

    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id,
                                              token=HF_TOKEN)

    llm_model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_id,
        token=HF_TOKEN,
        torch_dtype=torch.float16,
        # quantization_config=quantization_config if quantization_config else None,
        low_cpu_mem_usage=False,  # use full memory
        # attn_implementation=attn_implementation,  # which attention version to use
    )
    llm_model.to(device)

    return llm_model, tokenizer, device


# Create a text element and let the reader know the model is loading.
model_load_state = st.text('Loading model...')
# Load the model and tokenizer (cached across reruns by @st.cache_resource).
llm_model, tokenizer, device = load_model(model_id)
# Notify the reader that the model was successfully loaded.
model_load_state.text('Loading model...done!')


# INFERENCE
# def prompt_formatter(reviews, type_of_doc):
#     return f"""You are a summarization bot.
#     You will receive {type_of_doc} and you will extract all relevant information from {type_of_doc} and return one paragraph in which you will summarize what was said.
#     {type_of_doc} are listed below under inputs.
#     Inputs: {reviews}
#     Answer :
#     """

def prompt_formatter(reviews, type_of_doc):
    return f"""You are a summarization bot.
You will receive {type_of_doc} and you will summarize what was said in the input.
{type_of_doc} are listed below under inputs.
Inputs: {reviews}
Answer :
"""
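# NOTE: gemma-2b-it is an instruction-tuned checkpoint, so the raw prompt above
# could also be wrapped in the model's chat template via the standard
# transformers `apply_chat_template` API. A minimal, optional sketch; the
# helper name `chat_prompt_formatter` is ours and is not used by the app below:
def chat_prompt_formatter(reviews, type_of_doc):
    dialogue = [{"role": "user", "content": prompt_formatter(reviews, type_of_doc)}]
    # tokenize=False returns a plain string; add_generation_prompt appends the
    # turn marker the model expects before it starts generating
    return tokenizer.apply_chat_template(dialogue, tokenize=False, add_generation_prompt=True)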
def mirror_mirror(inputs, prompt_formatter, tokenizer, type_of_doc):
    print(inputs)
    prompt = prompt_formatter(inputs, type_of_doc)
    input_ids = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = llm_model.generate(**input_ids,
                                 temperature=0.3,
                                 do_sample=True,
                                 max_new_tokens=275)
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prompt, output_text.replace(prompt, '')


def summarization(example: str, type_of_doc: str, results_df: pd.DataFrame = pd.DataFrame()) -> pd.DataFrame:
    print(type_of_doc)

    # INFERENCE: generate up to five candidate summaries
    results = []
    for cnt in range(0, 5):
        print(cnt)
        prompt, result = mirror_mirror(example, prompt_formatter, tokenizer, type_of_doc)

        # score the candidate against the input via cosine similarity of token ids
        list_temp = [result, example]
        tokenized = tokenizer(list_temp, return_tensors="pt", padding=True)
        A = tokenized.input_ids.numpy()
        A = sparse.csr_matrix(A)
        score = cosine_similarity(A)[0, 1]
        print(score)

        # accept the first candidate that clears the threshold
        if score > 0.1:
            fin_result = result
            max_score = score
            break

        results.append(result)

    # tokenize results and example together
    try:
        fin_result
    except NameError:
        # if fin_result is not already defined, use the best of the available results;
        # append example to results so tokenization is done together (padding to a common length)
        results.append(example)
        tokenized = tokenizer(results, return_tensors="pt", padding=True)
        A = tokenized.input_ids.numpy()
        A = sparse.csr_matrix(A)
        # calculate the cosine similarity of each pair; keep only the
        # result x example column and drop the example's similarity
        # with itself (always 1.0), which would otherwise win argmax
        scores = cosine_similarity(A)[:-1, -1]
        # the final result is the one with the greatest cosine score
        fin_result = results[np.argmax(scores)]
        max_score = max(scores)

    # save the final result and its attributes
    row = pd.DataFrame({'model': model_id,
                        'prompt': prompt,
                        'reviews': example,
                        'summarization': fin_result,
                        'score': [max_score]})
    results_df = pd.concat([results_df, row], ignore_index=True)

    return results_df


# the text that will show in the text box as default
default_value = "The roar of the traffic, the passage of undifferentiated \
faces, this way and that way, drugs me into dreams; rubs the \
features from faces. People might walk through me. And what is \
this moment of time, this particular day in which I have found \
myself caught? The growl of traffic might be any uproar - forest trees or \
the roar of wild beasts. Time has whizzed back an inch or two on its reel; \
our short progress has been cancelled. I think also that our bodies are in truth \
naked. We are only lightly covered with buttoned cloth; and beneath these \
pavements are shells, bones and silence."
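# The scoring heuristic inside summarization() treats each text's padded row of
# token ids as a vector and compares rows with cosine similarity. A small
# self-contained illustration of that same heuristic (the helper name
# `token_id_similarity` is ours and is not called by the app itself):
def token_id_similarity(text_a, text_b, tokenizer):
    # pad both texts to a common length, then compare the raw id vectors
    ids = tokenizer([text_a, text_b], return_tensors="pt", padding=True).input_ids.numpy()
    return cosine_similarity(sparse.csr_matrix(ids))[0, 1]
# e.g. token_id_similarity(result, example, tokenizer) returns a high score
# when the two texts share many aligned token ids.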
st.title("Summarizer") inputs = st.text_area("Your text", default_value, height = 275) type_of_doc = st.text_area("Type of text", 'qoute from 'The Waves' by Virginia Woolf", height = 25) button = st.button('Summon the summarizer!') result = '' score = '' if button : results_df = summarization(inputs,type_of_doc) # only one input result = results_df.summarization[0] score = results_df.score[0] outputs = st.text_area("Summarized text", result) score = st.text_area("Cosine similarity score", score) # max_length = st.sidebar.slider("Max Length", min_value = 10, max_value=30) # temperature = st.sidebar.slider("Temperature", value = 1.0, min_value = 0.0, max_value=1.0, step=0.05) # top_k = st.sidebar.slider("Top-k", min_value = 0, max_value=5, value = 0) # top_p = st.sidebar.slider("Top-p", min_value = 0.0, max_value=1.0, step = 0.05, value = 0.9) # num_return_sequences = st.sidebar.number_input('Number of Return Sequences', min_value=1, max_value=5, value=1, step=1)