DjPapzin committed on
Commit
2defa07
·
1 Parent(s): 0e01c28

nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')

# Make sure the 'stopwords' corpus is present locally; if the lookup fails,
# download it so later calls to stopwords.words(...) can succeed.
try:
nltk.data.find('corpora/stopwords')
except LookupError:
nltk.download('stopwords')

# --- Preprocess text function (moved outside session state) ---
def preprocess_text(text):
    """Return *text* normalized to a space-joined string of kept tokens.

    Steps, in order:
      1. Lowercase the input.
      2. Replace every character that is not an ASCII letter, a digit,
         whitespace, or a comma with a single space.
      3. Split the result into tokens with ``word_tokenize``.
      4. Drop every token found in the English stop-word list.
      5. Join the surviving tokens with single spaces.
    """
    lowered = text.lower()

    # Commas are deliberately left in by the pattern; everything else that is
    # not alphanumeric or whitespace is blanked out rather than removed.
    sanitized = re.sub(r'[^a-zA-Z0-9\s\,]', ' ', lowered)

    words = word_tokenize(sanitized)

    # A set gives constant-time membership checks while filtering.
    english_stop_words = set(stopwords.words('english'))

    return ' '.join(
        token for token in words if token not in english_stop_words
    )

Files changed (1) hide show
  1. frontend/app.py +5 -0
frontend/app.py CHANGED
@@ -28,6 +28,11 @@ try:
28
  except LookupError:
29
  nltk.download('punkt')
30
 
 
 
 
 
 
31
  # --- Preprocess text function (moved outside session state) ---
32
  def preprocess_text(text):
33
  # Convert to lowercase
 
28
  except LookupError:
29
  nltk.download('punkt')
30
 
31
+ try:
32
+ nltk.data.find('corpora/stopwords')
33
+ except LookupError:
34
+ nltk.download('stopwords')
35
+
36
  # --- Preprocess text function (moved outside session state) ---
37
  def preprocess_text(text):
38
  # Convert to lowercase