Spaces:
Sleeping
Sleeping
import streamlit as st | |
from transformers import AutoModelForCausalLM, AutoTokenizer | |
from transformers.utils import is_flash_attn_2_available | |
from transformers import BitsAndBytesConfig | |
import pandas as pd | |
import os | |
import torch | |
import numpy as np | |
from scipy import sparse | |
from sklearn.metrics.pairwise import cosine_similarity | |
from scipy import sparse | |
# MODEL CONFIGURATION
# Hugging Face Hub identifier of the model served by this app.
model_id = 'google/gemma-2b-it'
# Access token for the Hub, read from the environment. When the variable is
# unset this is None (instead of raising a bare KeyError at import time), and
# ``from_pretrained`` then falls back to any locally cached login.
HF_TOKEN = os.environ.get('HF_TOKEN')
def load_model(model_id):
    """Load the causal language model and its tokenizer onto the best device.

    Args:
        model_id: Hugging Face Hub identifier of the model to load.

    Returns:
        A ``(llm_model, tokenizer, device)`` tuple. ``device`` is a
        ``torch.device`` when an accelerator (CUDA or MPS) is available and
        the plain string ``"cpu"`` otherwise, as in the previous version.
    """
    # Prefer CUDA, then Apple-silicon MPS, and only then fall back to the CPU.
    # Previously only MPS was probed, so CUDA machines ran on the CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = "cpu"
    print(device)
    if device == "cpu":
        print('Warning! No GPU available')

    # IMPORT MODEL
    print(model_id)
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=model_id,
        token=HF_TOKEN,
    )
    # NOTE(review): float16 weights are used on every device, including the
    # CPU fallback -- confirm this is acceptable for the installed torch build.
    llm_model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_id,
        token=HF_TOKEN,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=False,  # use full memory
    )
    llm_model.to(device)
    return llm_model, tokenizer, device
@st.cache_resource(show_spinner=False)
def _load_model_cached(model_id):
    """Load the model once per server process and reuse it across reruns.

    Streamlit re-executes the whole script on every widget interaction;
    without caching, the model would be reloaded on each button click.
    """
    return load_model(model_id)


# Create a text element and let the reader know the model is loading.
model_load_state = st.text('Loading model...')
# Load (or fetch from the cache) the model, its tokenizer and the device.
llm_model, tokenizer, device = _load_model_cached(model_id)
# Notify the reader that the model was successfully loaded.
model_load_state.text('Loading model...done!')
# INFERENCE | |
# def prompt_formatter(reviews, type_of_doc): | |
# return f"""You are a summarization bot. | |
# You will receive {type_of_doc} and you will extract all relevant information from {type_of_doc} and return one paragraph in which you will summarize what was said. | |
# {type_of_doc} are listed below under inputs. | |
# Inputs: {reviews} | |
# Answer : | |
# """ | |
def prompt_formatter(reviews, type_of_doc):
    """Build the summarization prompt for the language model.

    Args:
        reviews: The text to be summarized; inserted after ``Inputs:``.
        type_of_doc: Short description of the kind of text (e.g. "reviews").

    Returns:
        The prompt as a single newline-separated string ending in a newline
        after ``Answer :``.
    """
    prompt_lines = (
        "You are a summarization bot.",
        f"You will receive {type_of_doc} and you will summarize what was said in the input.",
        f"{type_of_doc} are listed below under inputs.",
        f"Inputs: {reviews}",
        "Answer :",
        "",  # keeps the trailing newline after "Answer :"
    )
    return "\n".join(prompt_lines)
def mirror_mirror(inputs, prompt_formatter, tokenizer, type_of_doc):
    """Generate a summary of ``inputs`` with the module-level ``llm_model``.

    Args:
        inputs: Text to summarize.
        prompt_formatter: Callable ``(inputs, type_of_doc) -> str`` that
            builds the prompt.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        type_of_doc: Description of the kind of text, forwarded to
            ``prompt_formatter``.

    Returns:
        A ``(prompt, generated_text)`` tuple, where ``generated_text`` holds
        only the text produced by the model, without the prompt.
    """
    prompt = prompt_formatter(inputs, type_of_doc)
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = llm_model.generate(
        **encoded,
        temperature=0.3,
        do_sample=True,
        max_new_tokens=275,
    )
    # A causal LM returns the prompt tokens followed by the new tokens. Drop
    # the prompt by position rather than with ``str.replace``: the decoded
    # text is not guaranteed to reproduce the prompt byte-for-byte, in which
    # case the replace would silently leave the whole prompt in the output.
    prompt_length = encoded["input_ids"].shape[-1]
    generated_tokens = outputs[0][prompt_length:]
    output_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return prompt, output_text
def _token_id_similarity(texts):
    """Return the pairwise cosine-similarity matrix of the padded token-id rows of ``texts``."""
    # The texts are tokenized together so padding gives every row one length.
    tokenized = tokenizer(texts, return_tensors="pt", padding=True)
    token_ids = sparse.csr_matrix(tokenized.input_ids.numpy())
    return cosine_similarity(token_ids)


def summarization(example: str, type_of_doc: str,
                  results_df: "pd.DataFrame | None" = None, *,
                  n_attempts: int = 5, min_score: float = 0.1) -> pd.DataFrame:
    """Summarize ``example`` and append the outcome to ``results_df``.

    The model is sampled up to ``n_attempts`` times. Each candidate is scored
    by the cosine similarity between its padded token-id vector and that of
    ``example``; the first candidate scoring above ``min_score`` is accepted.
    If none qualifies, the best-scoring candidate is used.

    Args:
        example: The text to summarize.
        type_of_doc: Description of the kind of text, passed to the prompt.
        results_df: Existing results to append to; ``None`` starts a new frame.
        n_attempts: Maximum number of generations to try (at least 1).
        min_score: Similarity a candidate must exceed to be accepted early.

    Returns:
        ``results_df`` with one extra row holding the columns ``model``,
        ``prompt``, ``reviews``, ``summarization`` and ``score``.

    Raises:
        ValueError: If ``n_attempts`` is smaller than 1.
    """
    if n_attempts < 1:
        raise ValueError("n_attempts must be at least 1")

    print(type_of_doc)
    # INFERENCE
    candidates = []
    fin_result = None
    max_score = None
    for cnt in range(n_attempts):
        print(cnt)
        prompt, result = mirror_mirror(example, prompt_formatter, tokenizer, type_of_doc)
        score = _token_id_similarity([result, example])[0, 1]
        if score > min_score:
            fin_result, max_score = result, score
            break
        candidates.append(result)

    if fin_result is None:
        # No candidate passed the threshold: use the best of those available.
        # The example is appended so everything is padded to the same length.
        # Only the candidate rows of the example column are compared; the
        # example's own row (self-similarity 1.0) is excluded, otherwise it
        # would always win and the input itself would be returned.
        scores = _token_id_similarity(candidates + [example])[:-1, -1]
        best = int(np.argmax(scores))
        fin_result = candidates[best]
        max_score = scores[best]

    # save final result and its attributes
    row = pd.DataFrame({'model': model_id, 'prompt': prompt, 'reviews': example,
                        'summarization': fin_result, 'score': [max_score]})
    if results_df is None:
        return row
    return pd.concat([results_df, row], ignore_index=True)
# Default text shown in the input box.
default_value = "I am a summarization bot! Let me summarize your reading for you!"
st.title("Mirror, mirror, on the cloud, what do Clockify users say aloud?")
st.subheader("--Clockify review summarizer--")
inputs = st.text_area("Your text", default_value, height=275)
# Single-line field: ``st.text_input`` is the widget meant for this.
# NOTE(review): the previous ``st.text_area(..., height=25)`` appears to be
# rejected by newer Streamlit releases (minimum height) -- confirm against the
# pinned Streamlit version.
type_of_doc = st.text_input("Type of text", 'text')
button = st.button('Summon the summarizer!')
result = ''
score = ''
if button:
    if inputs.strip():
        results_df = summarization(inputs, type_of_doc)
        # only one input, hence only one row
        result = results_df['summarization'].iloc[0]
        score = results_df['score'].iloc[0]
    else:
        # Nothing to summarize; skip the (slow) model call.
        st.warning("Please enter some text to summarize.")
outputs = st.text_area("Summarized text", result)
score = st.text_area("Cosine similarity score", str(score))
# max_length = st.sidebar.slider("Max Length", min_value = 10, max_value=30) | |
# temperature = st.sidebar.slider("Temperature", value = 1.0, min_value = 0.0, max_value=1.0, step=0.05) | |
# top_k = st.sidebar.slider("Top-k", min_value = 0, max_value=5, value = 0) | |
# top_p = st.sidebar.slider("Top-p", min_value = 0.0, max_value=1.0, step = 0.05, value = 0.9) | |
# num_return_sequences = st.sidebar.number_input('Number of Return Sequences', min_value=1, max_value=5, value=1, step=1) |