Spaces:
Runtime error
Runtime error
Commit
·
fbe3ac9
1
Parent(s):
c4a98a3
add first example of app.py
Browse files- Summarizer.py +56 -0
- app.py +111 -0
Summarizer.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import nltk
|
2 |
+
|
3 |
+
from sumy.nlp.stemmers import Stemmer
|
4 |
+
from sumy.summarizers.lsa import LsaSummarizer
|
5 |
+
from sumy.utils import get_stop_words
|
6 |
+
from transformers import Pipeline
|
7 |
+
|
8 |
+
class Summarizer:
    """Summarization helper combining an abstractive HuggingFace pipeline with
    a sumy LSA extractive summarizer.

    The static helpers deal with converting sumy sentence objects to plain
    strings and with packing sentences into token-bounded chunks so that long
    documents fit within the abstractive model's input window.
    """

    # Language used for the stemmer and the stop-word list.
    DEFAULT_LANGUAGE = "english"
    # Default number of sentences the UI asks the extractive step for.
    DEFAULT_SENTENCE_LENGTH = 15

    def __init__(self, pipeline: Pipeline):
        """Store the abstractive *pipeline* and build the LSA summarizer.

        :param pipeline: a transformers summarization pipeline; called in
            ``abstractive_summary`` with ``min_length``/``max_length`` kwargs.
        """
        self.pipeline = pipeline
        stemmer = Stemmer(Summarizer.DEFAULT_LANGUAGE)
        self.lsa_summarizer = LsaSummarizer(stemmer)
        self.lsa_summarizer.stop_words = get_stop_words(language=Summarizer.DEFAULT_LANGUAGE)

    @staticmethod
    def sentence_list(summarized_sentences) -> list:
        """Return the raw text of each sumy sentence object as a list of str."""
        # NOTE(review): relies on sumy's private Sentence._text attribute —
        # confirm str(sentence) is an equivalent public accessor.
        return [sentence._text for sentence in summarized_sentences]

    @staticmethod
    def join_sentences(summarized_sentences: list) -> str:
        """Join a list of sentence strings into one space-separated string."""
        # Fix: the original wrapped the list in a redundant comprehension.
        return " ".join(summarized_sentences)

    @staticmethod
    def split_sentences_by_token_length(summary_sentences: list, split_token_length: int) -> list:
        """Greedily pack *summary_sentences* into chunks whose token counts do
        not exceed *split_token_length*.

        Tokens are counted with ``nltk.word_tokenize``; bare "." tokens are
        excluded from the count. A single sentence longer than the limit is
        still emitted as its own chunk (it is never split mid-sentence).

        :param summary_sentences: list of sentence strings.
        :param split_token_length: maximum token budget per chunk.
        :return: list of chunk strings (sentences joined with spaces).
        """
        accumulated_list = []
        result_list = []
        cumulative_token_length = 0
        for sentence in summary_sentences:
            token_list = [token for token in nltk.word_tokenize(sentence) if token not in ["."]]
            token_length = len(token_list)
            # Flush the current chunk only if it is non-empty; otherwise an
            # oversized first sentence would produce an empty leading chunk.
            if token_length + cumulative_token_length > split_token_length and result_list:
                accumulated_list.append(Summarizer.join_sentences(result_list))
                result_list = [sentence]
                cumulative_token_length = token_length
            else:
                result_list.append(sentence)
                cumulative_token_length += token_length

        # Flush the trailing partial chunk, if any.
        if result_list:
            accumulated_list.append(Summarizer.join_sentences(result_list))

        return accumulated_list

    def abstractive_summary(self, summary_sentences: list) -> list:
        """Run the abstractive pipeline over token-bounded chunks of the input.

        :param summary_sentences: list of sentence strings to summarize.
        :return: one summary string per 600-token chunk.
        """
        wrapped_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, split_token_length=600)
        return [result["summary_text"]
                for result in self.pipeline(wrapped_sentences, min_length=32, max_length=512)]
app.py
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import html
|
2 |
+
import os
|
3 |
+
from typing import AnyStr
|
4 |
+
|
5 |
+
import nltk
|
6 |
+
import streamlit as st
|
7 |
+
import validators
|
8 |
+
from transformers import pipeline
|
9 |
+
from validators import ValidationFailure
|
10 |
+
|
11 |
+
from Summarizer import Summarizer
|
12 |
+
|
13 |
+
|
14 |
+
def main() -> None:
    """Render the Streamlit Terms-of-Service summarizer page.

    Builds the header, loads (and caches) the summarization pipeline, lets the
    user pick a bundled sample or paste text, and on button press runs the
    abstractive summary and renders it.
    """
    nltk.download("punkt")
    # header
    st.title(":bookmark_tabs: Terms Of Service Summarizer :bookmark_tabs:")
    st.markdown("The app aims to extract the main information from Terms Of Conditions, which are often too long and "
                "difficult to understand. ")
    st.markdown("To test it just copy-paste a Terms Of Conditions in the textarea or select one of the examples that "
                "we have prepared for you, then you will see the summary represented as the most important sentences.")
    st.markdown("If you want more info in how we built our NLP algorithm check the documentation in the following "
                "GitHub repo: :point_right: https://github.com/balditommaso/TermsOfServiceSummarization :point_left:")
    st.markdown(":skull_and_crossbones: NOTE :skull_and_crossbones::")
    st.markdown("the App is still under development and we do not give any guarantee on the quality of the summaries, "
                "so we suggest a careful reading of the document.")

    @st.cache(allow_output_mutation=True, suppress_st_warning=True, show_spinner=False)
    def create_pipeline():
        # Load the fine-tuned BART ToS model once; st.cache reuses it across reruns.
        with st.spinner("Loading the model..."):
            tos_pipeline = pipeline(task="summarization",
                                    model="ML-unipi/bart-large-tos",
                                    tokenizer="ML-unipi/bart-large-tos"
                                    )
        return tos_pipeline

    def display_summary(summary_sentences: list) -> None:
        # Render each summary sentence as an HTML list item.
        st.subheader("Summary :male-detective:")
        for sentence in summary_sentences:
            # SECURITY FIX: model output is interpolated into unsafe_allow_html
            # markdown — escape it (the file already imports `html` for this).
            st.markdown(f"<li>{html.escape(sentence)}</li>", unsafe_allow_html=True)

    def is_valid_url(url: str) -> bool:
        # validators.url returns a ValidationFailure object (not False) on bad input.
        result = validators.url(url)
        return not isinstance(result, ValidationFailure)

    def get_list_files() -> list:
        # Names (without extension) of the bundled sample .txt documents.
        names = []
        for file in os.listdir("./samples/"):
            if file.endswith(".txt"):
                names.append(file.replace(".txt", ""))
        return names

    def fetch_file_content(filename: str) -> AnyStr:
        # Sample files are stored lowercase under ./samples/.
        with open(f"./samples/{filename.lower()}.txt", "r") as file:
            text = file.read()
        return text

    summarizer: Summarizer = Summarizer(create_pipeline())

    # BUG FIX: the original guarded the misspelled key "sentence_lenght" while
    # setting "sentence_length", and the "sample_choice" guard re-assigned
    # st.session_state.sentence_length = "" — feeding an empty string to
    # st.number_input(value=...) and crashing the app at startup.
    if "target_text" not in st.session_state:
        st.session_state.target_text = ""
    if "sentence_length" not in st.session_state:
        st.session_state.sentence_length = Summarizer.DEFAULT_SENTENCE_LENGTH
    if "sample_choice" not in st.session_state:
        st.session_state.sample_choice = ""

    st.header("Input")

    sentences_length = st.number_input(
        label="How many sentences to be extracted:",  # typo fix: "senetences"
        min_value=5,
        max_value=15,
        value=st.session_state.sentence_length
    )
    # BUG FIX: the widget value was read but never stored back.
    st.session_state.sentence_length = sentences_length
    sample_choice = st.selectbox(
        label="Select a sample:",
        options=get_list_files()
    )

    st.session_state.target_text = fetch_file_content(sample_choice)
    target_text_input = st.text_area(
        value=st.session_state.target_text,
        label="Paste your own Term Of Service:",
        height=240
    )

    summarize_button = st.button(label="Try it!")

    @st.cache(suppress_st_warning=True,
              show_spinner=False,
              allow_output_mutation=True,
              hash_funcs={"torch.nn.parameter.Parameter": lambda _: None,
                          "tokenizers.Tokenizer": lambda _: None,
                          "tokenizers.AddedToken": lambda _: None,
                          })
    def summary_from_cache(summary_sentence: tuple) -> tuple:
        # Tuple in/out so both argument and result are hashable for st.cache.
        with st.spinner("Summarizing in progress..."):
            return tuple(summarizer.abstractive_summary(list(summary_sentence)))

    if summarize_button:
        # BUG FIX: the original called the transformers `pipeline` factory on
        # the raw text and then invoked its result — `output(output[0])` — a
        # guaranteed runtime error. Route through the defined helpers instead,
        # using the (possibly user-edited) textarea content.
        sentences = nltk.sent_tokenize(target_text_input)
        summary = summary_from_cache(tuple(sentences))
        display_summary(list(summary))
# Script entry point: launch the Streamlit app when run directly.
if __name__ == "__main__":
    main()