Spaces:
No application file
No application file
File size: 905 Bytes
4f084e5 0b7b04f 2fa1451 0b7b04f a186abb 0125fff 2fa1451 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from datasets import load_dataset
import gradio as gr
# Open the FineWeb "sample-10BT" train split in streaming mode, requesting only
# the 'text' column; records are read lazily when the dataset is iterated.
text_dataset = load_dataset(
    "HuggingFaceFW/fineweb",
    name="sample-10BT",
    split="train",
    streaming=True,
    columns=['text'],
)
def update(text_dataset, *, max_docs=None):
    """Count word bigrams over a dataset and return the total per bigram.

    Args:
        text_dataset: Iterable of records; each record is indexed with
            ``'text'`` to obtain the document string.
        max_docs: Optional cap on how many records are consumed. ``None``
            (the default) consumes the whole iterable.

    Returns:
        The column-wise sum of the document-by-bigram count matrix, i.e. one
        total per bigram in the fitted vocabulary.
    """
    from itertools import islice

    # A streaming dataset may be effectively unbounded; let callers bound it.
    docs = text_dataset if max_docs is None else islice(text_dataset, max_docs)

    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer="word")
    co_occurrences = bigram_vectorizer.fit_transform(doc['text'] for doc in docs)
    print('Printing sparse matrix:')
    print(co_occurrences)

    # Sum directly on the sparse matrix. Converting to dense first (as was done
    # previously, both for printing and for summing) allocates a full
    # documents x bigrams array, which is infeasible for a large corpus.
    sum_occ = co_occurrences.sum(axis=0)
    print('Sum of word-word occurrences:')
    print(sum_occ)
    return sum_occ
# UI: a single "Run" button that triggers the bigram count and shows the result.
with gr.Blocks() as app:
    gr.Markdown("Click **Run** to start calculating.")
    btn = gr.Button("Run")
    # Component that receives update()'s return value; previously `outputs`
    # referenced an undefined name.
    out = gr.Textbox(label="Sum of word-word occurrences")
    # The dataset is a plain Python object, not a Gradio component, so it
    # cannot be passed through `inputs`; bind it with a closure instead.
    btn.click(fn=lambda: update(text_dataset), inputs=None, outputs=out)

app.launch()