tommasobaldi committed
Commit 146d058 · 1 Parent(s): 1ba1fd2

working on text splitting

Files changed (1): app.py (+27 -60)
app.py CHANGED
@@ -1,14 +1,8 @@
-import html
 import os
 from typing import AnyStr
-
 import nltk
-from nltk.tokenize import sent_tokenize
-from nltk.tokenize import word_tokenize
 import streamlit as st
-import validators
 from transformers import pipeline
-from validators import ValidationFailure
 
 
 def main() -> None:
@@ -52,51 +46,6 @@ def main() -> None:
         text = file.read()
         return text
 
-    if "target_text" not in st.session_state:
-        st.session_state.target_text = ""
-    if "sentence_lenght" not in st.session_state:
-        st.session_state.sentence_length = 15
-    if "sample_choice" not in st.session_state:
-        st.session_state.sentence_length = ""
-
-    st.header("Input")
-
-    # sentences_length = st.number_input(
-    #     label="How many senetences to be extracted:",
-    #     min_value=5,
-    #     max_value=15,
-    #     step=1,
-    #     value=st.session_state.sentence_length
-    # )
-
-    sample_choice = st.selectbox(
-        label="Select a sample:",
-        options=get_list_files()
-    )
-
-    st.session_state.target_text = fetch_file_content(sample_choice)
-    target_text_input = st.text_area(
-        value=st.session_state.target_text,
-        label="Paste your own Term Of Service:",
-        height=240
-    )
-
-    summarize_button = st.button(label="Try it!")
-
-    # @st.cache(suppress_st_warning=True,
-    #           show_spinner=False,
-    #           allow_output_mutation=True,
-    #           hash_funcs={"torch.nn.parameter.Parameter": lambda _: None,
-    #                       "tokenizers.Tokenizer": lambda _: None,
-    #                       "tokenizers.AddedToken": lambda _: None,
-    #                       }
-    #           )
-
-
-    # def summary_from_cache(summary_sentence: tuple) -> tuple:
-    #     with st.spinner("Summarizing in progress..."):
-    #         return tuple(summarizer.abstractive_summary(list(summary_sentence)))
-
     def join_sentences(sentences: list) -> str:
         return " ".join([sentence for sentence in sentences])
 
@@ -120,20 +69,38 @@ def main() -> None:
 
     pipe = create_pipeline()
 
+    if "target_text" not in st.session_state:
+        st.session_state.target_text = ""
+    if "sentence_lenght" not in st.session_state:
+        st.session_state.sentence_length = 15
+    if "sample_choice" not in st.session_state:
+        st.session_state.sentence_length = ""
+
+    st.header("Input")
+    sample_choice = st.selectbox(
+        label="Select a sample:",
+        options=get_list_files()
+    )
+
+    st.session_state.target_text = fetch_file_content(sample_choice)
+    target_text_input = st.text_area(
+        value=st.session_state.target_text,
+        label="Paste your own Term Of Service:",
+        height=240
+    )
+
+    summarize_button = st.button(label="Try it!")
+
     if summarize_button:
         if target_text_input is not "":
-            summary_sentences = ""
+            summary_sentences = []
             with st.spinner("Summarizing in progress..."):
                 sentences = split_sentences_by_token_length(nltk.sent_tokenize(target_text_input), 600)
                 for sentence in sentences:
-                    summary_sentences += "".join(pipe(sentence)["summary_text"])
-
-                    display_summary(summary_sentences.split("."))
-                    #output = pipe(sentence)
-                    #st.markdown(output["summary_text"])
-
-
-
+                    output = pipe(sentence)
+                    summary = output["summary_text"]
+                    summary_sentences.append(summary.split("."))
+            display_summary(summary_sentences)
 
 
 if __name__ == "__main__":
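
Note: the new session-state guards carry over two key mix-ups from the removed block. The second guard tests the misspelled key "sentence_lenght" but assigns st.session_state.sentence_length, so the check never matches the value it sets, and the third guard tests "sample_choice" yet also assigns sentence_length. A corrected version of the guards could read as follows (the empty default for sample_choice is an assumption; the committed code gives that key no value of its own):

    if "target_text" not in st.session_state:
        st.session_state.target_text = ""
    if "sentence_length" not in st.session_state:  # key spelled consistently
        st.session_state.sentence_length = 15
    if "sample_choice" not in st.session_state:
        # hypothetical default; the diff assigns sentence_length = "" here
        st.session_state.sample_choice = ""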
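split_sentences_by_token_length is called with the NLTK sentence list and a 600-token budget but is not defined in these hunks. A minimal sketch of what such a helper could look like, assuming it greedily packs consecutive sentences into chunks that stay under the budget; the implementation is inferred from the call site, not taken from the repository, and uses nltk.word_tokenize as a stand-in for the model's own tokenizer:

    import nltk

    def split_sentences_by_token_length(sentences: list, max_tokens: int) -> list:
        # Hypothetical reconstruction: pack sentences into chunks whose
        # combined word-token count stays within max_tokens, so each chunk
        # fits the summarization model's input window.
        chunks, current, current_len = [], [], 0
        for sentence in sentences:
            length = len(nltk.word_tokenize(sentence))
            if current and current_len + length > max_tokens:
                chunks.append(" ".join(current))
                current, current_len = [], 0
            current.append(sentence)
            current_len += length
        if current:
            chunks.append(" ".join(current))
        return chunks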
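Two details in the new summarization loop are likely to bite at runtime. A transformers pipeline returns a list of dicts, so output["summary_text"] raises a TypeError; the result needs [0] first. Also, target_text_input is not "" compares identity rather than equality (CPython emits a SyntaxWarning for is against a literal), and summary_sentences.append(summary.split(".")) nests a list per chunk where display_summary presumably expects a flat list of sentences. A sketch with those points corrected, keeping the diff's own names (pipe, split_sentences_by_token_length, display_summary):

    if summarize_button:
        if target_text_input != "":  # equality, not identity
            summary_sentences = []
            with st.spinner("Summarizing in progress..."):
                chunks = split_sentences_by_token_length(nltk.sent_tokenize(target_text_input), 600)
                for chunk in chunks:
                    # pipeline output is a list like [{"summary_text": "..."}]
                    summary = pipe(chunk)[0]["summary_text"]
                    # extend (not append) keeps the sentence list flat
                    summary_sentences.extend(s.strip() for s in summary.split(".") if s.strip())
            display_summary(summary_sentences)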