import string

import nltk
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")  # needed by word_tokenize on newer NLTK releases


def preprocess_text(text_data):
    """
    Preprocesses text data by tokenizing, removing stopwords, punctuation, and non-alphabetic tokens.

    :param text_data: List of raw text documents
    :return: List of preprocessed tokenized texts
    """
    stop_words = set(stopwords.words("english"))
    processed_texts = [
        [
            word for word in word_tokenize(document.lower())
            if word not in stop_words and word not in string.punctuation and word.isalpha()
        ]
        for document in text_data
    ]
    return processed_texts


def train_lda(texts, num_topics=3):
    """
    Trains an LDA model on the given preprocessed text data.

    :param texts: List of tokenized texts
    :param num_topics: Number of topics for the LDA model
    :return: Trained LDA model and corresponding dictionary
    """
    dictionary = corpora.Dictionary(texts)  # Map each token to an integer id
    corpus = [dictionary.doc2bow(text) for text in texts]  # Bag-of-words vector per document
    ldamodel = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
    return ldamodel, dictionary


def extract_topic_words(ldamodel, num_topics=3, num_words=3):
    """
    Extracts meaningful words from each topic identified by the LDA model.

    :param ldamodel: Trained LDA model
    :param num_topics: Number of topics to extract
    :param num_words: Number of words per topic to consider
    :return: List of top words representing each topic
    """
    topics = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)
    topic_names = []
    for topic in topics:
        # Each topic is rendered as a string like '0.045*"word" + 0.032*"other"'
        words = topic[1].split(" + ")
        for word_data in words:
            word = word_data.split("*")[1].strip('"')  # Extract the bare word
            if word.isalpha() and len(word) > 2:  # Ensure it's a meaningful word
                topic_names.append(word)
                break  # Only take the top valid word per topic
    return list(dict.fromkeys(topic_names))  # Deduplicate while preserving order
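

# Example usage: a minimal, illustrative run of the full pipeline. The sample
# documents below are hypothetical and not part of the original script; with a
# corpus this small the discovered topics will be rough, but it shows how the
# three functions fit together.
if __name__ == "__main__":
    sample_docs = [
        "The cat chased the dog around the garden all afternoon.",
        "Investors sold technology shares as stock markets tumbled.",
        "The recipe calls for flour, sugar, butter, and a pinch of salt.",
    ]
    texts = preprocess_text(sample_docs)
    lda_model, dictionary = train_lda(texts, num_topics=3)
    topic_words = extract_topic_words(lda_model, num_topics=3, num_words=3)
    print("Representative topic words:", topic_words)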