# approach_api/utils/topic_extraction.py
import string

import nltk
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download only the NLTK resources needed by stopwords and word_tokenize
# ('punkt_tab' replaces 'punkt' in newer NLTK releases).
nltk.download('stopwords', quiet=True)
nltk.download('punkt_tab', quiet=True)

def preprocess_text(text_data):
"""
Preprocesses text data by tokenizing, removing stopwords, punctuation, and non-alphabetic tokens.
:param text_data: List of raw text documents
:return: List of preprocessed tokenized texts
"""
stop_words = set(stopwords.words("english"))
processed_texts = [
[
word for word in word_tokenize(document.lower())
if word not in stop_words and word not in string.punctuation and word.isalpha()
]
for document in text_data
]
return processed_texts
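# Example (illustrative): preprocess_text(["The cats sat on the mat."])
# -> [['cats', 'sat', 'mat']]: the stopwords ("the", "on") and the
# trailing period are filtered out by the checks above.
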
def train_lda(texts, num_topics=3):
"""
Trains an LDA model on the given preprocessed text data.
:param texts: List of tokenized texts
:param num_topics: Number of topics for the LDA model
:return: Trained LDA model and corresponding dictionary
"""
dictionary = corpora.Dictionary(texts)
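    # doc2bow converts each tokenized document into sparse (token_id, count)
    # pairs, e.g. dictionary.doc2bow(['cats', 'sat']) -> [(0, 1), (1, 1)]
    # (ids are illustrative).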
corpus = [dictionary.doc2bow(text) for text in texts]
ldamodel = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
return ldamodel, dictionary

def extract_topic_words(ldamodel, num_topics=3, num_words=3):
    """
    Extracts one representative word from each topic identified by the LDA model.
    :param ldamodel: Trained LDA model
    :param num_topics: Number of topics to extract
    :param num_words: Number of candidate words per topic to consider
    :return: List of unique words, one per topic where possible
    """
    # show_topics(formatted=False) returns (topic_id, [(word, probability), ...])
    # tuples, avoiding the fragile string parsing that print_topics output requires.
    topics = ldamodel.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)
    topic_names = []
    for _, word_probs in topics:
        for word, _prob in word_probs:
            if word.isalpha() and len(word) > 2:  # Ensure it's a meaningful word
                if word not in topic_names:  # Deduplicate while preserving topic order
                    topic_names.append(word)
                break  # Only take the top valid word per topic
    return topic_names
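

# Minimal usage sketch (illustrative only; the sample documents and the
# num_topics/num_words values below are assumptions, not part of the module).
if __name__ == "__main__":
    sample_docs = [
        "The stock market rallied as technology shares climbed.",
        "The team won the championship after a dramatic final match.",
        "New research explores how vaccines train the immune system.",
    ]
    tokens = preprocess_text(sample_docs)
    lda, dictionary = train_lda(tokens, num_topics=3)
    print(extract_topic_words(lda, num_topics=3, num_words=3))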