import html
import os
from typing import AnyStr

import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import streamlit as st
import validators
from transformers import pipeline
def main() -> None:
    nltk.download("punkt")

    # header
    st.title(":bookmark_tabs: Terms Of Service Summarizer :bookmark_tabs:")
    st.markdown("The app aims to extract the main information from Terms of Service documents, which are often too "
                "long and difficult to understand.")
    st.markdown("To test it, just copy-paste a Terms of Service into the text area or select one of the examples "
                "that we have prepared for you; you will then see the summary represented as the most important "
                "sentences.")
    st.markdown("If you want more info about how we built our NLP algorithm, check the documentation in the "
                "following GitHub repo: :point_right: https://github.com/balditommaso/TermsOfServiceSummarization "
                ":point_left:")
    st.markdown(":skull_and_crossbones: NOTE :skull_and_crossbones::")
    st.markdown("The app is still under development and we do not give any guarantee on the quality of the "
                "summaries, so we suggest a careful reading of the document.")
    def create_pipeline():
        with st.spinner("Loading the model..."):
            tos_pipeline = pipeline(task="summarization",
                                    model="ML-unipi/bart-large-tos",
                                    tokenizer="ML-unipi/bart-large-tos"
                                    )
        return tos_pipeline
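    # NOTE (suggestion, not part of the original app): the model is reloaded on every
    # Streamlit rerun. If the Space runs a recent Streamlit release, the loader could be
    # cached with the built-in resource cache, e.g.:
    #
    #     @st.cache_resource(show_spinner=False)
    #     def create_pipeline(): ...
    #
    # This is only a sketch; the decorator assumes Streamlit >= 1.18.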
    def display_summary(summary_sentences: list) -> None:
        # render each summary sentence as its own list item
        st.subheader("Summary :male-detective:")
        for sentence in summary_sentences:
            if sentence.strip():
                st.markdown(f"<li>{sentence.strip()}</li>", unsafe_allow_html=True)
    def get_list_files() -> list:
        names = []
        for file in os.listdir("./samples/"):
            if file.endswith(".txt"):
                names.append(file.replace(".txt", ""))
        return names
    def fetch_file_content(filename: str) -> AnyStr:
        with open(f"./samples/{filename.lower()}.txt", "r") as file:
            text = file.read()
        return text
if "target_text" not in st.session_state: | |
st.session_state.target_text = "" | |
if "sentence_lenght" not in st.session_state: | |
st.session_state.sentence_length = 15 | |
if "sample_choice" not in st.session_state: | |
st.session_state.sentence_length = "" | |
st.header("Input") | |
# sentences_length = st.number_input( | |
# label="How many senetences to be extracted:", | |
# min_value=5, | |
# max_value=15, | |
# step=1, | |
# value=st.session_state.sentence_length | |
# ) | |
    sample_choice = st.selectbox(
        label="Select a sample:",
        options=get_list_files()
    )

    st.session_state.target_text = fetch_file_content(sample_choice)

    target_text_input = st.text_area(
        value=st.session_state.target_text,
        label="Paste your own Terms of Service:",
        height=240
    )

    summarize_button = st.button(label="Try it!")
    # @st.cache(suppress_st_warning=True,
    #           show_spinner=False,
    #           allow_output_mutation=True,
    #           hash_funcs={"torch.nn.parameter.Parameter": lambda _: None,
    #                       "tokenizers.Tokenizer": lambda _: None,
    #                       "tokenizers.AddedToken": lambda _: None,
    #                       }
    #           )
    # def summary_from_cache(summary_sentence: tuple) -> tuple:
    #     with st.spinner("Summarizing in progress..."):
    #         return tuple(summarizer.abstractive_summary(list(summary_sentence)))
    def join_sentences(sentences: list) -> str:
        return " ".join(sentences)
    def split_sentences_by_token_length(sentences: list, split_token_length: int) -> list:
        accumulated_lists = []
        result_list = []
        cumulative_token_length = 0
        for sentence in sentences:
            token_list = [token for token in nltk.word_tokenize(sentence) if token not in ['.']]
            token_length = len(token_list)
            if token_length + cumulative_token_length > split_token_length and result_list:
                accumulated_lists.append(join_sentences(result_list))
                result_list = [sentence]
                cumulative_token_length = token_length
            else:
                result_list.append(sentence)
                cumulative_token_length += token_length
        if result_list:
            accumulated_lists.append(join_sentences(result_list))
        return accumulated_lists
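    # Illustrative example of the chunking above (not part of the app flow):
    # with split_token_length=5, the input ["Hello world.", "This is a test."]
    # yields two chunks, ["Hello world.", "This is a test."], because adding the
    # second sentence (4 tokens) to the first (2 tokens) would exceed the 5-token budget.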
    pipe = create_pipeline()

    if summarize_button:
        if target_text_input != "":
            with st.spinner("Summarizing in progress..."):
                # split the document into chunks small enough for the model's input size
                sentences = split_sentences_by_token_length(nltk.sent_tokenize(target_text_input), 600)
                for sentence in sentences:
                    # the summarization pipeline returns a list with one dict per input
                    output = pipe(sentence)
                    display_summary(output[0]["summary_text"].split("."))
if __name__ == "__main__":
    main()
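# To try the app locally (assuming this file is the Space's app.py and Streamlit,
# transformers, nltk, and validators are installed), one would typically run:
#     streamlit run app.py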