File size: 4,874 Bytes
fbe3ac9
 
 
 
 
b21075f
 
fbe3ac9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d33b093
fbe3ac9
 
 
 
 
612e17b
 
 
 
 
 
 
 
fbe3ac9
 
 
 
 
 
 
 
 
 
 
 
 
 
d33b093
 
 
 
 
 
 
 
fbe3ac9
 
d33b093
 
 
fbe3ac9
6d14e62
69f90b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d14e62
9c923b0
 
fbe3ac9
6d14e62
 
 
 
7360b2b
 
 
9c2c31c
2d14981
6d14e62
fbe3ac9
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import html
import os
from typing import AnyStr

import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import streamlit as st
import validators
from transformers import pipeline
from validators import ValidationFailure


def main() -> None:
    """Run the Terms Of Service Summarizer Streamlit app.

    The page shows an introductory header, a select box listing the ``.txt``
    samples found in ``./samples/``, a text area pre-filled with the selected
    sample (the user may paste their own text instead) and a button.  When the
    button is pressed the text is split into chunks of consecutive sentences
    and every chunk is displayed.

    The summarization model is loaded (and cached) on every run, but the call
    that would actually summarize each chunk is not wired in yet.
    """
    # Punkt models used by the NLTK sentence tokenizer.
    nltk.download("punkt")

    # header
    st.title(":bookmark_tabs: Terms Of Service Summarizer :bookmark_tabs:")
    st.markdown("The app aims to extract the main information from Terms Of Conditions, which are often too long and "
                "difficult to understand. ")
    st.markdown("To test it just copy-paste a Terms Of Conditions in the textarea or select one of the examples that "
                "we have prepared for you, then you will see the summary represented as the most important sentences.")
    st.markdown("If you want more info in how we built our NLP algorithm check the documentation in the following "
                "GitHub repo: :point_right: https://github.com/balditommaso/TermsOfServiceSummarization :point_left:")
    st.markdown(":skull_and_crossbones: NOTE :skull_and_crossbones::")
    st.markdown("the App is still under development and we do not give any guarantee on the quality of the summaries, "
                "so we suggest a careful reading of the document.")

    samples_dir = "./samples/"
    sample_suffix = ".txt"

    # NOTE(review): st.cache is deprecated in recent Streamlit releases in
    # favour of st.cache_resource -- confirm the pinned version before migrating.
    @st.cache(allow_output_mutation=True, suppress_st_warning=True, show_spinner=False)
    def create_pipeline():
        """Build the ``ML-unipi/bart-large-tos`` summarization pipeline."""
        with st.spinner("Loading the model..."):
            return pipeline(task="summarization",
                            model="ML-unipi/bart-large-tos",
                            tokenizer="ML-unipi/bart-large-tos"
                            )

    def display_summary(summary_sentences: list) -> None:
        """Render every summary sentence as an HTML list item."""
        st.subheader("Summary :male-detective:")
        for sentence in summary_sentences:
            # The markup is rendered as raw HTML, so the sentence itself must
            # be escaped to keep document/model text from being interpreted.
            st.markdown(f"<li>{html.escape(sentence)}</li>", unsafe_allow_html=True)

    def get_list_files() -> list:
        """Return the sorted names (without extension) of the sample files."""
        # os.listdir gives no ordering guarantee; sort so that the select box
        # (and its default entry) is stable across runs.
        return sorted(
            entry[:-len(sample_suffix)]  # drop only the trailing extension
            for entry in os.listdir(samples_dir)
            if entry.endswith(sample_suffix)
        )

    def fetch_file_content(filename: str) -> str:
        """Return the text of the sample called *filename* (no extension)."""
        # The name comes straight from the directory listing, so it is used
        # verbatim: changing its case would miss the file on a case-sensitive
        # filesystem.
        path = os.path.join(samples_dir, filename + sample_suffix)
        with open(path, "r", encoding="utf-8") as file:
            return file.read()

    def split_text(text: str, max_tokens: int = 500) -> list:
        """Group the sentences of *text* into chunks below *max_tokens* tokens.

        Sentences are kept whole and in order.  A new chunk is started when
        adding the next sentence would bring the running word-token count to
        *max_tokens* or more, so a single sentence that is longer than the
        limit ends up alone in its own chunk.  Returns a list of strings,
        empty when *text* contains no sentence.
        """
        chunks = []
        current_sentences = []
        token_count = 0
        for sentence in sent_tokenize(text, language="english"):
            n_tokens = len(word_tokenize(sentence, language="english", preserve_line=True))
            # Only flush a non-empty chunk, otherwise an over-long first
            # sentence would produce an empty leading chunk.
            if current_sentences and token_count + n_tokens >= max_tokens:
                chunks.append(" ".join(current_sentences))
                current_sentences = []
                token_count = 0
            current_sentences.append(sentence)
            token_count += n_tokens
        # The last chunk never hits the limit, so it has to be flushed here.
        if current_sentences:
            chunks.append(" ".join(current_sentences))
        return chunks

    # Initialise each session value once, under the same key it is stored with.
    session_defaults = {"target_text": "", "sentence_length": 15, "sample_choice": ""}
    for key, default in session_defaults.items():
        if key not in st.session_state:
            st.session_state[key] = default

    st.header("Input")

    # TODO: re-enable a number input (5-15, default st.session_state.sentence_length)
    # to let the user choose how many sentences are extracted.

    sample_choice = st.selectbox(
        label="Select a sample:",
        options=get_list_files()
    )

    # The select box yields None when no sample file is available.
    st.session_state.target_text = fetch_file_content(sample_choice) if sample_choice else ""
    target_text_input = st.text_area(
        value=st.session_state.target_text,
        label="Paste your own Term Of Service:",
        height=240
    )

    summarize_button = st.button(label="Try it!")

    # Loaded before the button is handled so the (cached) model is ready as
    # soon as the summarization step below is enabled.
    pipe = create_pipeline()

    if summarize_button:
        if target_text_input.strip():
            with st.spinner("Summarizing in progress..."):
                for chunk in split_text(target_text_input):
                    st.text(chunk)
                    # TODO: summarize each chunk with `pipe(chunk)` and render
                    # the result; the pipeline presumably returns a list with
                    # one dict per input, i.e. `pipe(chunk)[0]["summary_text"]`
                    # -- confirm against the transformers documentation.
        else:
            st.warning("Please paste a Terms Of Service or select a sample first.")





# Script entry point: build the page only when the file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()