Spaces:
Sleeping
Sleeping
# Make sure the NLTK resources looked up below are present locally.
# Each resource is probed with nltk.data.find and downloaded only when the
# probe raises LookupError.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' instead of
# 'punkt' when tokenizing — confirm against the installed NLTK version.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
# --- Preprocess text function (moved outside session state) ---
def preprocess_text(text):
    """Clean *text* and return its non-stop-word tokens joined by spaces.

    The input is lowercased, every character that is not an ASCII letter,
    a digit, whitespace or a comma is replaced by a space, the result is
    split with ``word_tokenize`` and tokens found in the English stop-word
    list are discarded.

    Args:
        text: The string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    lowered = text.lower()
    # Replace anything outside letters, digits, whitespace and commas.
    sanitized = re.sub(r'[^a-zA-Z0-9\s\,]', ' ', lowered)
    words = word_tokenize(sanitized)
    english_stop_words = set(stopwords.words('english'))
    return ' '.join(
        word for word in words if word not in english_stop_words
    )
frontend/app.py CHANGED (+5 -0)

@@ -28,6 +28,11 @@ try:
 except LookupError:
     nltk.download('punkt')
 
+try:
+    nltk.data.find('corpora/stopwords')
+except LookupError:
+    nltk.download('stopwords')
+
 # --- Preprocess text function (moved outside session state) ---
 def preprocess_text(text):
     # Convert to lowercase