# Topic modeling utilities: text preprocessing, LDA training, and topic-word extraction.
import string

import nltk
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required NLTK resources up front (no-op if already present locally).
nltk.download("stopwords")
nltk.download("punkt")
def preprocess_text(text_data):
    """
    Preprocess raw documents for topic modeling.

    Lower-cases each document, tokenizes it with NLTK, and keeps only
    alphabetic tokens that are not English stopwords.  A separate
    punctuation check is unnecessary: ``str.isalpha`` already rejects
    every punctuation token.

    :param text_data: List of raw text documents
    :return: List of token lists, one per input document
    """
    # Build the stopword set once, outside the comprehension.
    stop_words = set(stopwords.words("english"))
    return [
        [
            token
            for token in word_tokenize(document.lower())
            if token.isalpha() and token not in stop_words
        ]
        for document in text_data
    ]
def train_lda(texts, num_topics=3, random_state=None):
    """
    Train an LDA topic model on preprocessed, tokenized documents.

    :param texts: List of tokenized texts (list of lists of words)
    :param num_topics: Number of topics for the LDA model
    :param random_state: Optional seed for reproducible training
                         (default ``None`` preserves prior behavior)
    :return: Tuple of (trained LDA model, gensim Dictionary)
    """
    dictionary = corpora.Dictionary(texts)
    # Convert each document to its bag-of-words representation.
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=10,
        random_state=random_state,
    )
    return ldamodel, dictionary
def extract_topic_words(ldamodel, num_topics=3, num_words=3):
    """
    Extract one representative word per topic from a trained LDA model.

    Parses the ``print_topics`` string output (terms formatted like
    ``0.123*"word"``), taking the first "meaningful" word — purely
    alphabetic and longer than two characters — from each topic.

    Fix: the original returned ``list(set(...))``, which discards topic
    order and makes the result nondeterministic; duplicates are now
    removed while preserving topic order via ``dict.fromkeys``.

    :param ldamodel: Trained LDA model (must provide ``print_topics``)
    :param num_topics: Number of topics to extract
    :param num_words: Number of words per topic to consider
    :return: Ordered list of unique representative words (at most one per topic)
    """
    topics = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)
    topic_names = []
    for _, topic_repr in topics:
        # Terms are joined with " + "; each looks like 0.123*"word".
        for term in topic_repr.split(" + "):
            word = term.split("*")[1].strip('"')
            if word.isalpha() and len(word) > 2:  # skip scores/short tokens
                topic_names.append(word)
                break  # only the top valid word per topic
    # Order-preserving deduplication.
    return list(dict.fromkeys(topic_names))