# approach_api/utils/topic_extraction.py
import string

import nltk
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download only the NLTK resources needed by stopwords and word_tokenize
# ('punkt_tab' replaces 'punkt' in newer NLTK releases).
nltk.download('stopwords', quiet=True)
nltk.download('punkt_tab', quiet=True)

def preprocess_text(text_data):
"""
Preprocesses text data by tokenizing, removing stopwords, punctuation, and non-alphabetic tokens.
:param text_data: List of raw text documents
:return: List of preprocessed tokenized texts
"""
stop_words = set(stopwords.words("english"))
processed_texts = [
[
word for word in word_tokenize(document.lower())
if word not in stop_words and word not in string.punctuation and word.isalpha()
]
for document in text_data
]
return processed_texts
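# Example (illustrative): preprocess_text(["The cats sat on the mat."])
# -> [['cats', 'sat', 'mat']]: the stopwords ("the", "on") and the
# trailing period are filtered out by the checks above.
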
def train_lda(texts, num_topics=3):
"""
Trains an LDA model on the given preprocessed text data.
:param texts: List of tokenized texts
:param num_topics: Number of topics for the LDA model
:return: Trained LDA model and corresponding dictionary
"""
dictionary = corpora.Dictionary(texts)
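    # doc2bow converts each tokenized document into sparse (token_id, count)
    # pairs, e.g. dictionary.doc2bow(['cats', 'sat']) -> [(0, 1), (1, 1)]
    # (ids are illustrative).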
corpus = [dictionary.doc2bow(text) for text in texts]
ldamodel = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
return ldamodel, dictionary

def extract_topic_words(ldamodel, num_topics=3, num_words=3):
    """
    Extracts one representative word from each topic identified by the LDA model.
    :param ldamodel: Trained LDA model
    :param num_topics: Number of topics to extract
    :param num_words: Number of candidate words per topic to consider
    :return: List of unique words, one per topic where possible
    """
    # show_topics(formatted=False) returns (topic_id, [(word, probability), ...])
    # tuples, avoiding the fragile string parsing that print_topics output requires.
    topics = ldamodel.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)
    topic_names = []
    for _, word_probs in topics:
        for word, _prob in word_probs:
            if word.isalpha() and len(word) > 2:  # Ensure it's a meaningful word
                if word not in topic_names:  # Deduplicate while preserving topic order
                    topic_names.append(word)
                break  # Only take the top valid word per topic
    return topic_names
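

# Minimal usage sketch (illustrative only; the sample documents and the
# num_topics/num_words values below are assumptions, not part of the module).
if __name__ == "__main__":
    sample_docs = [
        "The stock market rallied as technology shares climbed.",
        "The team won the championship after a dramatic final match.",
        "New research explores how vaccines train the immune system.",
    ]
    tokens = preprocess_text(sample_docs)
    lda, dictionary = train_lda(tokens, num_topics=3)
    print(extract_topic_words(lda, num_topics=3, num_words=3))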