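"""Streamlit app that summarizes Terms of Service documents.

It splits the input into chunks that fit the ML-unipi/bart-large-tos model,
summarizes each chunk, and displays the most important sentences. Launch it
with Streamlit, e.g. `streamlit run app.py` (file name assumed).
"""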
import os
import re
from typing import AnyStr

import nltk
import streamlit as st
from transformers import pipeline, AutoTokenizer


def main() -> None:
    # header
    st.title(":bookmark_tabs: Terms Of Service Summarizer :bookmark_tabs:")
    st.markdown("The app aims to extract the main information from Terms of Service documents, which are often too "
                "long and difficult to understand.")
    st.markdown("To test it, just copy-paste a Terms of Service into the text area or select one of the examples "
                "that we have prepared for you; you will then see the summary presented as the most important "
                "sentences.")
    st.markdown("If you want more info on how we built our NLP algorithm, check the documentation in the following "
                "GitHub repo: :point_right: https://github.com/balditommaso/TermsOfServiceSummarization :point_left:")
    st.markdown(":skull_and_crossbones: NOTE :skull_and_crossbones::")
    st.markdown("The app is still under development and we give no guarantee on the quality of the summaries, "
                "so we suggest a careful reading of the document.")
    @st.cache(allow_output_mutation=True, suppress_st_warning=True, show_spinner=False)
    def create_pipeline():
        with st.spinner("Loading the model..."):
            tos_pipeline = pipeline(task="summarization",
                                    model="ML-unipi/bart-large-tos",
                                    tokenizer="ML-unipi/bart-large-tos",
                                    )
        return tos_pipeline
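
    # Split a generated summary into sentences with a regex that avoids
    # breaking on abbreviations (e.g. "e.g." or "Mr."), keep only lines that
    # contain a period, and collapse accidental ".." into ".".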
    def clean_summaries(text: str) -> list:
        result = []
        lines = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
        for line in lines:
            if line.find(".") != -1:
                line = line.replace("..", ".")
                result.append(line)
        return result
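
    # Render each summary sentence as an HTML list item; unsafe_allow_html is
    # needed because Streamlit escapes raw HTML by default.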
    def display_summary(summary_sentences: list) -> None:
        st.subheader("Summary :male-detective:")
        for sentence in summary_sentences:
            st.markdown(f"<li>{sentence}</li>", unsafe_allow_html=True)
    def get_list_files() -> list:
        names = []
        for file in os.listdir("./samples/"):
            if file.endswith(".txt"):
                names.append(file.replace(".txt", ""))
        return names
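
    # Read a sample back from disk; the chosen name is lower-cased before the
    # lookup, so sample files are expected to have lowercase names.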
    def fetch_file_content(filename: str) -> AnyStr:
        with open(f"./samples/{filename.lower()}.txt", "r", encoding="utf-8") as file:
            text = file.read()
        return text

    def join_sentences(sentences: list) -> str:
        return " ".join(sentences)
    def split_sentences_by_token_length(sentences: list, split_token_length: int) -> list:
        accumulated_lists = []
        result_list = []
        cumulative_token_length = 0
        for sentence in sentences:
            token_list = tokenizer(sentence, max_length=1024, truncation=True)
            token_length = len(token_list["input_ids"])
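            # skip very short "sentences" (10 tokens or fewer), which are
            # usually headings or list markers rather than real clauses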
            if token_length > 10:
                if token_length + cumulative_token_length > split_token_length and result_list:
                    accumulated_lists.append(join_sentences(result_list))
                    result_list = [sentence]
                    cumulative_token_length = token_length
                else:
                    result_list.append(sentence)
                    cumulative_token_length += token_length
        if result_list:
            accumulated_lists.append(join_sentences(result_list))
        return accumulated_lists
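
    # one-time setup: NLTK sentence tokenizer data, the cached summarization
    # pipeline, and the matching tokenizer used for chunk sizing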
    nltk.download("punkt")
    pipe = create_pipeline()
    tokenizer = AutoTokenizer.from_pretrained("ML-unipi/bart-large-tos")

    if "target_text" not in st.session_state:
        st.session_state.target_text = ""
    if "sample_choice" not in st.session_state:
        st.session_state.sample_choice = ""
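
    # input widgets: a sample selector plus a free-text area pre-filled with
    # the chosen sample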
    st.header("Input")
    sample_choice = st.selectbox(
        label="Select a sample:",
        options=get_list_files()
    )
    st.session_state.target_text = fetch_file_content(sample_choice)
    target_text_input = st.text_area(
        value=st.session_state.target_text,
        label="Paste your own Terms of Service:",
        height=240
    )
    summarize_button = st.button(label="Try it!")
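
    # on click: sentence-tokenize the input, split it into model-sized chunks,
    # summarize each chunk, then clean and display the collected sentences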
    if summarize_button:
        if target_text_input != "":
            summary_sentences = []
            with st.spinner("Summarization in progress..."):
                sentences = split_sentences_by_token_length(
                    nltk.sent_tokenize(target_text_input, language="english"),
                    split_token_length=1024
                )
                for sentence in sentences:
                    output = pipe(sentence)
                    summary = output[0]["summary_text"]
                    summary_sentences += clean_summaries(summary)
            display_summary(summary_sentences)


if __name__ == "__main__":
    main()