KevlarVK committed on
Commit
a31f350
·
1 Parent(s): 07f9878

caching and regex fix

Browse files
Files changed (1) hide show
  1. Utils.py +7 -5
Utils.py CHANGED
@@ -7,7 +7,7 @@ import streamlit as st
7
  from youtube_transcript_api import YouTubeTranscriptApi
8
  import spacy
9
 
10
- @st.cache_data
11
  def fetch_article_text(url: str):
12
 
13
  r = requests.get(url)
@@ -15,13 +15,12 @@ def fetch_article_text(url: str):
15
  results = soup.find_all(["h1", "p"])
16
  text = [result.text for result in results]
17
  ARTICLE = " ".join(text)
18
-
19
- return ARTICLE
20
 
21
  def count_tokens(text: str):
22
  return len(text.split(" "))
23
 
24
- @st.cache_data
25
  def get_text_from_youtube_url(url: str):
26
 
27
  id = url.split("=")[1]
@@ -74,13 +73,16 @@ def add_punctuation(text: str):
74
 
75
 
76
  def get_input_chunks(text: str, max_length: int = 500):
 
 
 
77
  try:
78
  sentences = sent_tokenize(text)
79
  except:
80
  nltk.download('punkt')
81
  sentences = sent_tokenize(text)
82
 
83
- sentences = [re.sub(r'\[[0-9]*\]', ' ', sentence) for sentence in sentences if len(sentence.strip()) > 0 and count_tokens(sentence) > 4]
84
 
85
  input_chunks = []
86
  temp_sentences = ""
 
7
  from youtube_transcript_api import YouTubeTranscriptApi
8
  import spacy
9
 
10
+ @st.cache
11
  def fetch_article_text(url: str):
12
 
13
  r = requests.get(url)
 
15
  results = soup.find_all(["h1", "p"])
16
  text = [result.text for result in results]
17
  ARTICLE = " ".join(text)
18
+ return re.sub(r'\[\d+\]', '', ARTICLE)
 
19
 
20
  def count_tokens(text: str):
21
  return len(text.split(" "))
22
 
23
+ @st.cache
24
  def get_text_from_youtube_url(url: str):
25
 
26
  id = url.split("=")[1]
 
73
 
74
 
75
  def get_input_chunks(text: str, max_length: int = 500):
76
+
77
+ text = re.sub(r'\[\d+\]', '', text)
78
+
79
  try:
80
  sentences = sent_tokenize(text)
81
  except:
82
  nltk.download('punkt')
83
  sentences = sent_tokenize(text)
84
 
85
+ sentences = [sentence for sentence in sentences if len(sentence.strip()) > 0 and count_tokens(sentence) > 4]
86
 
87
  input_chunks = []
88
  temp_sentences = ""