github_search_visualizations / text_visualization.py
lambdaofgod's picture
feat: Implement `extract_from_corpus` in `text_visualization.py`
42de6bd
raw
history blame
1.48 kB
from typing import Any, Dict, Iterable

import wordcloud
from pydantic import BaseModel
from sklearn.feature_extraction.text import TfidfVectorizer
class WordCloudExtractor:
    """Build a word cloud whose word weights are corpus-level TF-IDF scores.

    NOTE(review): `BaseModel` is imported by this file but the class does not
    derive from it, so `tfidf_params` below is only an annotation. Confirm
    whether `WordCloudExtractor(BaseModel)` was the intended declaration.
    """

    # Extra keyword arguments forwarded to TfidfVectorizer. Nothing in this
    # class assigns the attribute, so it is read defensively and treated as
    # empty when a caller has not set it on the instance.
    tfidf_params: Dict[str, Any]

    def extract_from_corpus(self, texts: Iterable[str], n_words: int) -> wordcloud.WordCloud:
        """Fit TF-IDF on `texts` and return a word cloud of the top terms.

        Args:
            texts: Raw text documents making up the corpus.
            n_words: Maximum number of terms kept by the vectorizer and
                drawn in the cloud.

        Returns:
            A `wordcloud.WordCloud` generated from each term's TF-IDF weight
            averaged over all documents.

        Raises:
            Any error raised by the vectorizer or by `WordCloud` (for example
            on a corpus with no usable words) propagates unchanged.
        """
        # Copy so the caller's dict is never mutated; the explicit n_words
        # argument takes precedence over any max_features in tfidf_params.
        vectorizer_kwargs = dict(getattr(self, "tfidf_params", None) or {})
        vectorizer_kwargs["max_features"] = n_words

        vectorizer = TfidfVectorizer(**vectorizer_kwargs)
        doc_term_weights = vectorizer.fit_transform(texts)

        # Average each term's weight over the documents (axis 0), then flatten
        # so the scores line up one-to-one with the feature names.
        mean_weights = np.asarray(doc_term_weights.mean(axis=0)).ravel()
        frequencies = dict(zip(vectorizer.get_feature_names_out(), mean_weights))

        return wordcloud.WordCloud(max_words=n_words).generate_from_frequencies(frequencies)
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import numpy as np
class TextVisualization:
    """Helpers that turn a text corpus into word-cloud input."""

    @staticmethod
    def extract_from_corpus(texts, max_features=100):
        """
        Score the vocabulary of a corpus by its average TF-IDF weight.

        Args:
            texts: Text documents to vectorize
            max_features: Upper bound on the number of terms the vectorizer keeps

        Returns:
            Dictionary mapping each retained term to its TF-IDF weight
            averaged over all documents, suitable for WordCloud
        """
        # Lowercased input with English stop words removed
        vectorizer = TfidfVectorizer(
            lowercase=True,
            stop_words='english',
            max_features=max_features,
        )

        doc_term_weights = vectorizer.fit_transform(texts)

        # Average over axis 0 (the documents), then flatten to a 1-D array
        # so the scores pair up with the feature names
        per_term_mean = np.array(doc_term_weights.mean(axis=0)).flatten()
        vocabulary = vectorizer.get_feature_names_out()

        return dict(zip(vocabulary, per_term_mean))