Spaces:
No application file
No application file
from itertools import islice

import gradio as gr
import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
# Open the FineWeb "sample-10BT" training split in streaming mode, requesting
# only the 'text' column; records are then fetched as the dataset is iterated.
text_dataset = load_dataset(
    "HuggingFaceFW/fineweb",
    name="sample-10BT",
    split="train",
    streaming=True,
    columns=['text'],
)
def update(text_dataset, max_docs=None):
    """Count how often each word bigram occurs across a set of documents.

    The ``'text'`` field of every record is tokenised into word bigrams with
    ``CountVectorizer`` and the per-document counts are summed over all
    documents.  Intermediate results are printed for inspection.

    Args:
        text_dataset: Iterable of records, each indexable by ``'text'`` to
            obtain the document string.
        max_docs: Optional cap on the number of records consumed.  ``None``
            (the default) reads the iterable to exhaustion, which can take
            a very long time on a large streamed corpus.

    Returns:
        A ``1 x n_bigrams`` matrix holding, for each bigram in the fitted
        vocabulary, its total count over all processed documents.

    Raises:
        ValueError: Propagated from ``CountVectorizer`` when no bigram can
            be extracted, e.g. when no documents are supplied.
    """
    records = text_dataset if max_docs is None else islice(text_dataset, max_docs)
    texts = (doc['text'] for doc in records)

    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer="word")
    co_occurrences = bigram_vectorizer.fit_transform(texts)
    print('Printing sparse matrix:')
    print(co_occurrences)

    # The matrix is documents x bigrams; converting it to a dense array would
    # allocate every zero cell, so only its shape is reported here.
    print('Matrix shape (documents, bigrams):')
    print(co_occurrences.shape)

    # Column sums computed directly on the sparse matrix: same result as
    # summing the dense form, without materialising it.
    sum_occ = co_occurrences.sum(axis=0)
    print('Sum of word-word occurrences:')
    print(sum_occ)
    return sum_occ
# Single-button UI: pressing "Run" computes the bigram totals and shows them.
with gr.Blocks() as app:
    gr.Markdown("Click **Run** to start calculating.")
    # Output component for update()'s return value.  The click handler
    # referenced `out` without it ever being created (NameError).
    out = gr.Textbox(label="Sum of word-word occurrences")
    btn = gr.Button("Run")
    # `inputs` accepts Gradio components only, so the module-level dataset is
    # bound through a closure instead of being passed as an input.
    btn.click(fn=lambda: update(text_dataset), inputs=None, outputs=out)

app.launch()