Spaces:
Running
Running
File size: 4,281 Bytes
6f595b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import streamlit as st
import pandas as pd
from transformers import pipeline
from stqdm import stqdm
from simplet5 import SimpleT5
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
@st.cache(allow_output_mutation=True)
def load_t5():
    """Load the pretrained ``t5-base`` seq2seq model and its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple created with ``from_pretrained("t5-base")``.
    """
    # allow_output_mutation=True stops Streamlit from hashing the returned
    # objects on every call to detect mutation. That check is costly for
    # large model objects and can warn or fail for them.
    model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
    tokenizer = AutoTokenizer.from_pretrained("t5-base")
    return model, tokenizer
@st.cache(allow_output_mutation=True)
def custom_model():
    """Build the summarization pipeline backed by the local ``my_awesome_sum/`` model.

    Returns:
        The object returned by ``pipeline("summarization", model="my_awesome_sum/")``.
    """
    # allow_output_mutation=True: the cached pipeline is reused as-is instead
    # of being re-hashed on every call to check whether it was mutated.
    return pipeline("summarization", model="my_awesome_sum/")
@st.cache
def convert_df(df):
    """Serialize *df* to CSV text and return it encoded as UTF-8 bytes."""
    # Cached so the conversion is not recomputed on every script rerun.
    csv_text = df.to_csv()
    return csv_text.encode("utf-8")
def load_one_line_summarizer(model):
    """Load the ``snrspeaks/t5-one-line-summary`` T5 weights into *model*.

    Args:
        model: Object exposing ``load_model(model_type, model_name)``; the
            script passes a freshly created ``SimpleT5`` instance.

    Returns:
        Whatever ``model.load_model`` returns. The caller in this script
        ignores it and relies on *model* having been loaded.
    """
    # Deliberately NOT wrapped in st.cache: this function is called for its
    # effect on *model*. A cache hit would skip the body, leaving a newly
    # created instance without loaded weights.
    return model.load_model("t5", "snrspeaks/t5-one-line-summary")
# Row cap applied by the "t5-base" and "t5-one-line-summary" options.
PREVIEW_ROWS = 10


def _read_reviews(uploaded):
    """Read the uploaded file into a DataFrame with lower-cased column names.

    Files whose name ends in ``.csv`` are parsed with ``pd.read_csv``; the
    other accepted types (xlsx/xls) are parsed with ``pd.read_excel``.
    """
    if uploaded.name.lower().endswith(".csv"):
        frame = pd.read_csv(uploaded)
    else:
        frame = pd.read_excel(uploaded)
    # str() guards against non-string column labels (e.g. numeric headers).
    frame.columns = [str(col).lower() for col in frame.columns]
    return frame


def _summarize_rows(texts, summarize_one):
    """Return one summary per entry of *texts*, in the same order.

    A row whose summarization raises gets an empty summary so the result
    stays aligned with *texts*; failures are reported with ``st.warning``.
    """
    summaries = []
    failures = 0
    last_error = None
    for review in stqdm(texts):
        try:
            # str() so non-string cells (numbers, NaN) do not break the models.
            summaries.append(summarize_one(str(review)))
        except Exception as err:  # best effort: keep going on a bad row
            summaries.append("")
            failures += 1
            last_error = err
    if failures:
        st.warning(
            f"{failures} row(s) could not be summarized and were left blank. "
            f"Last error: {last_error}"
        )
    return summaries


def _offer_download(output, option):
    """Show a download button for *output* serialized as CSV."""
    st.download_button(
        label="Download data as CSV",
        data=convert_df(output),
        file_name=f"{option}_df.csv",
        mime="text/csv",
    )


def _process(uploaded, option):
    """Summarize the ``text`` column of the uploaded file with the chosen model."""
    if uploaded is None:
        st.warning("Please upload a file before pressing Process.")
        return

    df = _read_reviews(uploaded)
    if "text" not in df.columns:
        st.error('The uploaded file must contain a "text" column.')
        return

    text = df["text"].values.tolist()

    if option == "Custom trained on the dataset":
        model = custom_model()

        def summarize_one(review):
            result = model(
                f"summarize: {review}", max_length=50, early_stopping=True
            )
            return result[0]["summary_text"]

    elif option == "t5-base":
        # Slicing (rather than indexing 0..9) also works for shorter files.
        text = text[:PREVIEW_ROWS]
        model, tokenizer = load_t5()

        def summarize_one(review):
            tokens_input = tokenizer.encode(
                "summarize: " + review,
                return_tensors="pt",
                max_length=tokenizer.model_max_length,
                truncation=True,
            )
            summary_ids = model.generate(
                tokens_input,
                min_length=80,
                max_length=150,
                length_penalty=20,
                num_beams=2,
            )
            return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    elif option == "t5-one-line-summary":
        text = text[:PREVIEW_ROWS]
        model = SimpleT5()
        load_one_line_summarizer(model=model)

        def summarize_one(review):
            return model.predict(review)[0]

    else:
        return

    if not text:
        st.warning("The uploaded file contains no rows to summarize.")
        return

    summary = _summarize_rows(text, summarize_one)
    output = pd.DataFrame({"text": text, "summary": summary})
    _offer_download(output, option)


st.set_page_config(layout="wide", page_title="Amazon Review Summarizer")
st.title("Amazon Review Summarizer")
uploaded_file = st.file_uploader("Choose a file", type=["xlsx", "xls", "csv"])
summarizer_option = st.selectbox(
    "Select Summarizer",
    ("Custom trained on the dataset", "t5-base", "t5-one-line-summary"),
)
ps = st.empty()

if st.button("Process"):
    _process(uploaded_file, summarizer_option)
|