meg HF staff committed on
Commit
4f084e5
1 Parent(s): a186abb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -11
app.py CHANGED
@@ -1,16 +1,15 @@
1
- import gradio as gr
2
- import json
3
  import numpy as np
4
- import pandas as pd
5
- from sklearn.feature_extraction.text import TfidfVectorizer
6
-
7
  from datasets import load_dataset
8
 
9
  text_dataset = load_dataset("HuggingFaceFW/fineweb", name="sample-10BT", split="train", streaming=True, columns=['text'])
10
 
11
- def greet(name):
12
- print(text_dataset)
13
- return "Hello " + name + "!!"
14
-
15
- app = gr.Interface(fn=greet, inputs="text", outputs="text")
16
- app.launch()
 
 
 
 
1
+ from sklearn.feature_extraction.text import CountVectorizer
 
2
  import numpy as np
 
 
 
3
  from datasets import load_dataset
4
 
5
  text_dataset = load_dataset("HuggingFaceFW/fineweb", name="sample-10BT", split="train", streaming=True, columns=['text'])
6
 
7
+ bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer="word")
8
+ co_occurrences = bigram_vectorizer.fit_transform(doc['text'] for doc in text_dataset)
9
+ print('Printing sparse matrix:')
10
+ print(co_occurrences)
11
+ print('Printing dense matrix')
12
+ print(co_occurrences.todense())
13
+ sum_occ = np.sum(co_occurrences.todense(), axis=0)
14
+ print('Sum of word-word occurrences:')
15
+ print(sum_occ)