# Standard library
from typing import Any, Dict, Iterable

# Third-party
import wordcloud
from pydantic import BaseModel
# NOTE(review): original line was `from sklearn import TfIdfVectorizer`, which
# raises ImportError (wrong casing and wrong module); corrected to the canonical
# path, matching the duplicate import further down the file.
from sklearn.feature_extraction.text import TfidfVectorizer
class WordCloudExtractor:
    """Build a word cloud from the most salient terms of a text corpus.

    Attributes:
        tfidf_params: Keyword arguments forwarded to ``TfidfVectorizer``
            (e.g. ``stop_words``, ``lowercase``, ``ngram_range``).
            NOTE(review): `BaseModel` is imported at file top but this class
            does not inherit it — presumably intended as a pydantic model;
            confirm before relying on validation/initialization of this field.
    """

    tfidf_params: Dict[str, Any]

    def extract_from_corpus(self, texts: Iterable[str], n_words: int) -> wordcloud.WordCloud:
        """Create a word cloud from the ``n_words`` top-ranked corpus terms.

        The original method was an empty stub that returned ``None`` despite
        the annotated return type; this implements the declared contract.

        Args:
            texts: Documents to vectorize.
            n_words: Number of highest-scoring words to keep.

        Returns:
            A ``wordcloud.WordCloud`` built from mean TF-IDF frequencies.
        """
        # `tfidf_params` is a bare class-level annotation, so it may be unset
        # on plain instances; fall back to vectorizer defaults in that case.
        params = getattr(self, "tfidf_params", None) or {}
        vectorizer = TfidfVectorizer(**params)
        matrix = vectorizer.fit_transform(texts)
        # Mean TF-IDF across documents gives one salience score per term.
        scores = np.asarray(matrix.mean(axis=0)).ravel()
        names = vectorizer.get_feature_names_out()
        # Keep only the n_words highest-scoring terms (descending).
        top = np.argsort(scores)[::-1][:n_words]
        frequencies = {names[i]: float(scores[i]) for i in top}
        return wordcloud.WordCloud().generate_from_frequencies(frequencies)
from sklearn.feature_extraction.text import TfidfVectorizer | |
from wordcloud import WordCloud | |
import numpy as np | |
class TextVisualization:
    """Helpers for turning a text corpus into word-cloud-ready data."""

    # BUG FIX: the original `def extract_from_corpus(texts, max_features=100)`
    # had neither `self` nor a decorator, so calling it on an instance passed
    # the instance itself as `texts`. @staticmethod keeps existing class-level
    # calls (`TextVisualization.extract_from_corpus(docs)`) working unchanged
    # and makes instance calls correct as well.
    @staticmethod
    def extract_from_corpus(texts, max_features=100):
        """
        Extract word frequencies from a corpus using TF-IDF vectorization
        and generate word cloud frequencies.

        Args:
            texts: List of text documents.
            max_features: Maximum number of words to include.

        Returns:
            Dict mapping word -> mean TF-IDF score across all documents,
            suitable for ``WordCloud.generate_from_frequencies``.

        Raises:
            ValueError: propagated from scikit-learn when ``texts`` is empty
                or contains only stop words (empty vocabulary).
        """
        tfidf = TfidfVectorizer(
            max_features=max_features,
            stop_words='english',
            lowercase=True,
        )
        tfidf_matrix = tfidf.fit_transform(texts)
        feature_names = tfidf.get_feature_names_out()
        # Mean TF-IDF over the document axis = per-word salience score.
        mean_tfidf = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
        # Plain floats so the result is JSON-serializable and not tied to
        # numpy scalar types.
        return {word: float(score) for word, score in zip(feature_names, mean_tfidf)}