tommasobaldi committed
Commit b21075f · 1 Parent(s): 2d14981

update requirements.txt

Files changed (1):
app.py +14 -12
app.py CHANGED
@@ -3,6 +3,8 @@ import os
 from typing import AnyStr
 
 import nltk
+from nltk.tokenize import sent_tokenize
+from nltk.tokenize import word_tokenize
 import streamlit as st
 import validators
 from transformers import pipeline
@@ -96,20 +98,20 @@ def main() -> None:
     # return tuple(summarizer.abstractive_summary(list(summary_sentence)))
 
     def split_text(text: str) -> list:
-        tokens = nltk.tokenize(text)
-        sentences = []
+        sentences = sent_tokenize(text)
         token_count = 0
-        sentence = ""
-        for token in tokens:
-            if token_count < 1024:
-                sentence += "".join(token + " ")
-                token_count += 1
+        text_block = ""
+        result = []
+        for sentence in sentences:
+            tokens = word_tokenize(sentence)
+            if token_count + len(tokens) < 1024:
+                token_count += len(tokens)
+                text_block += " ".join(sentence)
             else:
-                sentences.append(sentence)
-                token_count = 0
-                sentence = ""
-
-        return sentences
+                result.append(text_block)
+                text_block = "".join(sentence)
+                token_count = len(tokens)
+        return result
 
     pipe = create_pipeline()
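Two notes on this diff. The removed implementation could never have run: nltk.tokenize is a module, not a callable, so nltk.tokenize(text) raises a TypeError. The added implementation runs but still misbehaves: sentence is a str, so " ".join(sentence) inserts a space between every character rather than appending the sentence, "".join(sentence) is just a copy of sentence, and the final text_block is never appended to result, so the tail of the input is silently dropped. Below is a minimal corrected sketch of the same sentence-packing idea; the max_tokens parameter, the empty-block guard, the trailing flush, and the punkt download are my additions, not part of the commit.

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download("punkt", quiet=True)  # both tokenizers need the punkt model on first use

def split_text(text: str, max_tokens: int = 1024) -> list:
    # Greedily pack whole sentences into blocks of fewer than max_tokens words.
    result = []
    text_block = ""
    token_count = 0
    for sentence in sent_tokenize(text):
        tokens = word_tokenize(sentence)
        if token_count + len(tokens) < max_tokens:
            token_count += len(tokens)
            text_block += sentence + " "  # append the sentence itself, not its characters
        else:
            if text_block:  # guard against emitting an empty first block
                result.append(text_block.strip())
            text_block = sentence + " "
            token_count = len(tokens)
    if text_block.strip():  # flush the final block instead of dropping it
        result.append(text_block.strip())
    return result

With this version each returned block stays under the budget as counted by word_tokenize. Note that NLTK's word counts only approximate the summarization model's own tokenizer, so a safety margin below the model's real input limit is sensible, and a single sentence longer than max_tokens will still land in an oversized block of its own.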