File size: 2,185 Bytes
b396e94
 
 
 
 
 
 
1a2e35e
 
 
b396e94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk

# Download only the NLTK resources this module actually uses:
# 'stopwords' backs stopwords.words() and 'punkt_tab' backs word_tokenize().
# Previously nltk.download('all') was called as well, which fetches every
# NLTK corpus and model (several GB) and made these targeted downloads
# redundant.
nltk.download('stopwords')
nltk.download('punkt_tab')

def preprocess_text(text_data):
    """
    Clean a collection of raw documents into lists of content tokens.

    Each document is lowercased and tokenized; tokens that are English
    stopwords, punctuation, or not purely alphabetic are discarded.

    :param text_data: List of raw text documents
    :return: List of preprocessed tokenized texts
    """
    ignored = set(stopwords.words("english"))

    cleaned = []
    for document in text_data:
        tokens = word_tokenize(document.lower())
        kept = [
            token
            for token in tokens
            if token not in ignored
            and token not in string.punctuation
            and token.isalpha()
        ]
        cleaned.append(kept)
    return cleaned

def train_lda(texts, num_topics=3):
    """
    Fit a gensim LDA topic model over tokenized documents.

    :param texts: List of tokenized texts
    :param num_topics: Number of topics for the LDA model
    :return: Trained LDA model and corresponding dictionary
    """
    # Map each token to an integer id, then express every document as a
    # bag-of-words vector over that vocabulary.
    vocab = corpora.Dictionary(texts)
    bow_corpus = [vocab.doc2bow(document) for document in texts]

    model = models.LdaModel(
        bow_corpus,
        num_topics=num_topics,
        id2word=vocab,
        passes=10,
    )
    return model, vocab

def extract_topic_words(ldamodel, num_topics=3, num_words=3):
    """
    Pick one representative word per LDA topic.

    Parses the ``weight*"word"`` terms produced by print_topics and keeps
    the first purely alphabetic word longer than two characters from each
    topic, de-duplicated across topics.

    :param ldamodel: Trained LDA model
    :param num_topics: Number of topics to extract
    :param num_words: Number of words per topic to consider
    :return: List of top words representing each topic
    """
    selected = []
    raw_topics = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)

    for _, description in raw_topics:
        # Each term in the description looks like: 0.065*"word"
        for term in description.split(" + "):
            candidate = term.split("*")[1].strip('"')
            # Skip numbers and very short tokens; keep one word per topic.
            if candidate.isalpha() and len(candidate) > 2:
                selected.append(candidate)
                break

    return list(set(selected))  # Ensure unique topics