"""Gradio demo: count word-bigram occurrences over a streamed FineWeb sample."""

from itertools import islice
from typing import Iterable, Mapping, Optional

import gradio as gr
import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer

# Upper bound on the number of documents read per button click. The dataset
# is streamed, so without a bound a single click would try to consume the
# entire split before returning anything.
MAX_DOCS = 1000

text_dataset = load_dataset(
    "HuggingFaceFW/fineweb",
    name="sample-10BT",
    split="train",
    streaming=True,
    columns=['text'],
)


def update(
    text_dataset: Iterable[Mapping[str, str]],
    max_docs: Optional[int] = None,
) -> np.ndarray:
    """Count how often each word bigram occurs across the given documents.

    Args:
        text_dataset: Iterable of records; the ``'text'`` entry of each record
            is used as one document.
        max_docs: Maximum number of documents to read from ``text_dataset``.
            ``None`` (the default) reads the whole iterable, which matches the
            behaviour before this parameter existed.

    Returns:
        The column-wise sum of the document-by-bigram count matrix, i.e. a
        single row holding the total count of each bigram in the vectorizer's
        vocabulary.
    """
    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer="word")

    # islice(..., None) passes everything through, so one code path covers
    # both the bounded and the unbounded case.
    texts = islice((doc['text'] for doc in text_dataset), max_docs)
    co_occurrences = bigram_vectorizer.fit_transform(texts)

    print('Printing sparse matrix:')
    print(co_occurrences)

    # Sum on the sparse matrix directly. Converting to dense first allocates
    # documents x bigrams cells, which does not fit in memory for web-scale
    # text; for the same reason the dense matrix is no longer printed.
    sum_occ = co_occurrences.sum(axis=0)
    print('Sum of word-word occurrences:')
    print(sum_occ)
    return sum_occ


def _run_update() -> str:
    """Click handler: run ``update`` on the module-level dataset.

    Gradio event ``inputs`` must be UI components, so the dataset is bound
    here instead of being passed through ``inputs``. The result is rendered
    as text for display in a textbox.
    """
    return str(update(text_dataset, max_docs=MAX_DOCS))


with gr.Blocks() as app:
    gr.Markdown("Click **Run** to start calculating.")
    btn = gr.Button("Run")
    # Output component that receives the handler's return value.
    out = gr.Textbox(label="Sum of word-word occurrences")
    btn.click(fn=_run_update, inputs=None, outputs=out)

app.launch()