Spaces:

ML-unipi
/

TermsOfServiceSummarization

Runtime error

App Files Files Community

TermsOfServiceSummarization / app.py

tommasobaldi

working on text splitting

d09a554 almost 3 years ago

raw

history blame

5.4 kB

	import html
	import os
	from typing import AnyStr

	import nltk
	from nltk.tokenize import sent_tokenize
	from nltk.tokenize import word_tokenize
	import streamlit as st
	import validators
	from transformers import pipeline
	from validators import ValidationFailure


	def main() -> None:
	    """Render the Terms Of Service summarizer Streamlit page.

	    The page shows an introduction, lets the user pick one of the sample
	    documents stored in ``./samples/`` (or paste their own text) and, when
	    "Try it!" is pressed, splits the text into chunks, summarizes each chunk
	    with the ``ML-unipi/bart-large-tos`` pipeline and lists the resulting
	    sentences.
	    """
	    samples_dir = "./samples/"
	    # Upper bound, counted in NLTK word tokens, for a chunk sent to the model.
	    max_chunk_tokens = 600

	    # Streamlit re-runs this function on every interaction, so only call the
	    # downloader when the Punkt tokenizer data is actually missing.
	    try:
	        nltk.data.find("tokenizers/punkt")
	    except LookupError:
	        nltk.download("punkt")

	    # header
	    st.title(":bookmark_tabs: Terms Of Service Summarizer :bookmark_tabs:")
	    st.markdown("The app aims to extract the main information from Terms Of Conditions, which are often too long and "
	                "difficult to understand. ")
	    st.markdown("To test it just copy-paste a Terms Of Conditions in the textarea or select one of the examples that "
	                "we have prepared for you, then you will see the summary represented as the most important sentences.")
	    st.markdown("If you want more info in how we built our NLP algorithm check the documentation in the following "
	                "GitHub repo: :point_right: https://github.com/balditommaso/TermsOfServiceSummarization :point_left:")
	    st.markdown(":skull_and_crossbones: NOTE :skull_and_crossbones::")
	    st.markdown("the App is still under development and we do not give any guarantee on the quality of the summaries, "
	                "so we suggest a careful reading of the document.")

	    # NOTE(review): st.cache is deprecated in recent Streamlit releases in
	    # favour of st.cache_resource; confirm the pinned version before switching.
	    @st.cache(allow_output_mutation=True, suppress_st_warning=True, show_spinner=False)
	    def create_pipeline():
	        """Build the summarization pipeline (cached by ``st.cache``)."""
	        with st.spinner("Loading the model..."):
	            return pipeline(
	                task="summarization",
	                model="ML-unipi/bart-large-tos",
	                tokenizer="ML-unipi/bart-large-tos",
	            )

	    def display_summary(summary_sentences: list) -> None:
	        """Render the summary as a list, one item per sentence.

	        Args:
	            summary_sentences: Plain-text sentences to show.
	        """
	        st.subheader("Summary :male-detective:")
	        for sentence in summary_sentences:
	            # The text is generated from user-supplied input and rendered with
	            # unsafe_allow_html=True, so it must be escaped first.
	            st.markdown(f"<li>{html.escape(sentence)}</li>", unsafe_allow_html=True)

	    def get_list_files() -> list:
	        """Return the sorted names of the ``.txt`` samples, extension removed.

	        Returns an empty list when the samples directory does not exist.
	        """
	        if not os.path.isdir(samples_dir):
	            return []
	        return sorted(
	            os.path.splitext(entry)[0]
	            for entry in os.listdir(samples_dir)
	            if entry.endswith(".txt")
	        )

	    def fetch_file_content(filename: str) -> str:
	        """Return the text of the sample called ``filename``.

	        An empty or ``None`` name (no sample available) yields ``""``.
	        """
	        if not filename:
	            return ""
	        # The name is used exactly as listed by get_list_files(); lower-casing
	        # it would miss mixed-case files on case-sensitive filesystems.
	        sample_path = os.path.join(samples_dir, f"{filename}.txt")
	        with open(sample_path, "r", encoding="utf-8") as file:
	            return file.read()

	    if "target_text" not in st.session_state:
	        st.session_state.target_text = ""
	    # sentence_length is reserved for the sentence-count input, which is
	    # currently disabled.
	    if "sentence_length" not in st.session_state:
	        st.session_state.sentence_length = 15
	    if "sample_choice" not in st.session_state:
	        st.session_state.sample_choice = ""

	    st.header("Input")

	    # TODO: re-enable a number_input ("How many sentences to be extracted",
	    # 5-15) bound to st.session_state.sentence_length.

	    sample_choice = st.selectbox(
	        label="Select a sample:",
	        options=get_list_files(),
	    )

	    st.session_state.target_text = fetch_file_content(sample_choice)
	    target_text_input = st.text_area(
	        value=st.session_state.target_text,
	        label="Paste your own Term Of Service:",
	        height=240,
	    )

	    summarize_button = st.button(label="Try it!")

	    def join_sentences(sentences: list) -> str:
	        """Join the sentences into one string separated by single spaces."""
	        return " ".join(sentences)

	    def split_sentences_by_token_length(sentences: list, split_token_length: int) -> list:
	        """Group consecutive sentences into chunks of bounded token length.

	        Sentences are accumulated until adding the next one would exceed
	        ``split_token_length`` word tokens ("." tokens are not counted). A
	        single sentence longer than the limit still forms its own chunk.

	        Args:
	            sentences: Sentences in document order.
	            split_token_length: Maximum number of tokens per chunk.

	        Returns:
	            The chunks, each one a string of space-joined sentences.
	        """
	        chunks = []
	        current_chunk = []
	        current_length = 0
	        for sentence in sentences:
	            token_length = sum(1 for token in word_tokenize(sentence) if token != ".")
	            if current_chunk and current_length + token_length > split_token_length:
	                chunks.append(join_sentences(current_chunk))
	                current_chunk = []
	                current_length = 0
	            current_chunk.append(sentence)
	            current_length += token_length
	        if current_chunk:
	            chunks.append(join_sentences(current_chunk))
	        return chunks

	    pipe = create_pipeline()

	    if summarize_button:
	        if not target_text_input.strip():
	            st.warning("Please paste a Terms Of Service or select a sample first.")
	            return
	        with st.spinner("Summarizing in progress..."):
	            chunks = split_sentences_by_token_length(
	                sent_tokenize(target_text_input), max_chunk_tokens
	            )
	            summary_sentences = []
	            for chunk in chunks:
	                # The pipeline returns a list of {"summary_text": ...} dicts;
	                # truncation guards against chunks longer than the model limit.
	                for output in pipe(chunk, truncation=True):
	                    summary_sentences.extend(sent_tokenize(output["summary_text"]))
	        # Shown once, after every chunk has been summarized.
	        display_summary(summary_sentences)





	# Script entry point: build the page only when the file is executed directly.
	if __name__ == "__main__":
	    main()