Spaces:

huggingface
/

text-data-filtering

Runtime error

App Files Files Community

HugoLaurencon committed on Dec 16, 2021

Commit

6303415

1 Parent(s): 07c617e

new visu

Browse files

Files changed (2) hide show

app.py +41 -15
filtering_pipeline_oscar.pdf +0 -0

app.py CHANGED Viewed

@@ -2,6 +2,9 @@
 import streamlit as st
 import json
 import pandas as pd
@@ -12,14 +15,27 @@ import matplotlib.pyplot as plt
 class Visualization:
     def __init__(
-        self, path_data, lang, num_docs, num_docs_for_words, max_len_text_display
     ):
         self.path_data = path_data
         self.lang = lang
         self.num_docs = num_docs
         self.num_docs_for_words = num_docs_for_words
         self.max_len_text_display = max_len_text_display
     def open_data(self):
         with open(self.path_data) as json_file:
             data = json.load(json_file)
@@ -42,7 +58,7 @@ class Visualization:
         self.docs = pd.DataFrame(docs)
     def set_title(self):
-        st.title(f"{self.num_docs} {self.lang} documents from Oscar with their stats.")
     def filtering_of_docs(self):
         st.sidebar.subheader("Parameters of the filtering on documents")
@@ -59,14 +75,15 @@ class Visualization:
             def print_discared_by_cond(cond):
                 st.sidebar.caption(
-                    f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter"
                 )
                 st.sidebar.caption("---------")
             if "number_words" in columns:
                 max_nb_words = int(np.max(docs["number_words"])) + 1
                 cutoff_min_number_words = st.sidebar.slider(
-                    "Min cutoff number words", 0, max_nb_words, 0
                 )
                 new_key = ("number_words", cutoff_min_number_words, False)
                 keys.append(new_key)
@@ -74,8 +91,9 @@ class Visualization:
                 conds.append(cond)
                 print_discared_by_cond(cond)
                 cutoff_max_number_words = st.sidebar.slider(
-                    "Max cutoff number words", 0, max_nb_words, max_nb_words
                 )
                 new_key = ("number_words", cutoff_max_number_words, True)
                 keys.append(new_key)
@@ -84,8 +102,9 @@ class Visualization:
                 print_discared_by_cond(cond)
             if "special_characters_ratio" in columns:
                 cutoff_special_characters_ratio = st.sidebar.slider(
-                    "Max cutoff special characters ratio", 0.0, 1.0, 1.0, step=0.01
                 )
                 new_key = (
                     "special_characters_ratio",
@@ -98,8 +117,9 @@ class Visualization:
                 print_discared_by_cond(cond)
             if "stopwords_ratio" in columns:
                 cutoff_stopwords_ratio = st.sidebar.slider(
-                    "Min cutoff stopwords ratio", 0.0, 1.0, 0.0, step=0.01
                 )
                 new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
                 keys.append(new_key)
@@ -108,8 +128,9 @@ class Visualization:
                 print_discared_by_cond(cond)
             if "badwords_ratio" in columns:
                 cutoff_badwords_ratio = st.sidebar.slider(
-                    "Max cutoff badwords ratio", 0.0, 1.0, 1.0, step=0.01
                 )
                 new_key = ("badwords_ratio", cutoff_badwords_ratio, True)
                 keys.append(new_key)
@@ -118,8 +139,9 @@ class Visualization:
                 print_discared_by_cond(cond)
             if "lang_id_score" in columns:
                 cutoff_lang_id_score = st.sidebar.slider(
-                    "Min cutoff lang id score", 0.0, 1.0, 0.0, step=0.01
                 )
                 new_key = ("lang_id_score", cutoff_lang_id_score, False)
                 keys.append(new_key)
@@ -128,9 +150,10 @@ class Visualization:
                 print_discared_by_cond(cond)
             if "perplexity_score" in columns:
                 max_pp = int(np.max(docs["perplexity_score"])) + 1
                 cutoff_perplexity_score = st.sidebar.slider(
-                    "Perplexity cutoff perplexity score", 0, max_pp, max_pp
                 )
                 new_key = ("perplexity_score", cutoff_perplexity_score, True)
                 keys.append(new_key)
@@ -167,13 +190,14 @@ class Visualization:
     def filtering_of_words(self):
         st.sidebar.subheader("Parameter of the filtering on words")
-        max_len_word = int(np.max(self.words["len_word"])) + 1
-        cutoff_word = st.sidebar.slider(
-            "Max cutoff length word", 0, max_len_word, max_len_word
         )
         incorrect_substrings = st.sidebar.checkbox(
-            "Remove words with incorrect substrings"
         )
         cond_words = self.words["len_word"] <= cutoff_word
@@ -258,6 +282,7 @@ class Visualization:
             )
     def visualization(self):
         self.open_data()
         self.set_title()
         self.filtering_of_docs()
@@ -267,6 +292,7 @@ class Visualization:
         self.download_data()
 path_data = "./en_examples_with_stats.json"
 lang = "English"
 num_docs = 5000
@@ -274,6 +300,6 @@ num_docs_for_words = 500
 max_len_text_display = 10000
 visualization = Visualization(
-    path_data, lang, num_docs, num_docs_for_words, max_len_text_display
 )
 visualization.visualization()

 import streamlit as st
+import os
+import base64
 import json
 import pandas as pd
 class Visualization:
     def __init__(
+        self, path_instructions, path_data, lang, num_docs, num_docs_for_words, max_len_text_display
     ):
+        self.path_instructions = path_instructions
         self.path_data = path_data
         self.lang = lang
         self.num_docs = num_docs
         self.num_docs_for_words = num_docs_for_words
         self.max_len_text_display = max_len_text_display
+    def preamble(self):
+        st.markdown("Before diving into this demo, you might want to take a look at how the filtering pipeline of OSCAR looks like in more detail.")
+        def get_binary_file_downloader_html(bin_file, file_label='File'):
+            with open(bin_file, 'rb') as f:
+                data = f.read()
+            bin_str = base64.b64encode(data).decode()
+            href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">{file_label}</a>'
+            return href
+        st.markdown(get_binary_file_downloader_html(self.path_instructions, "Download the filtering pipeline of OSCAR as pdf"), unsafe_allow_html=True)
     def open_data(self):
         with open(self.path_data) as json_file:
             data = json.load(json_file)
         self.docs = pd.DataFrame(docs)
     def set_title(self):
+        st.title(f"{self.num_docs} {self.lang} documents from OSCAR with their stats.")
     def filtering_of_docs(self):
         st.sidebar.subheader("Parameters of the filtering on documents")
             def print_discared_by_cond(cond):
                 st.sidebar.caption(
+                    f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter."
                 )
                 st.sidebar.caption("---------")
             if "number_words" in columns:
+                cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
                 max_nb_words = int(np.max(docs["number_words"])) + 1
                 cutoff_min_number_words = st.sidebar.slider(
+                    cutoff_def, 0, min(max_nb_words, 500), 0
                 )
                 new_key = ("number_words", cutoff_min_number_words, False)
                 keys.append(new_key)
                 conds.append(cond)
                 print_discared_by_cond(cond)
+                cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
                 cutoff_max_number_words = st.sidebar.slider(
+                    cutoff_def, 0, max_nb_words, max_nb_words
                 )
                 new_key = ("number_words", cutoff_max_number_words, True)
                 keys.append(new_key)
                 print_discared_by_cond(cond)
             if "special_characters_ratio" in columns:
+                cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
                 cutoff_special_characters_ratio = st.sidebar.slider(
+                    cutoff_def, 0.0, 1.0, 1.0, step=0.01
                 )
                 new_key = (
                     "special_characters_ratio",
                 print_discared_by_cond(cond)
             if "stopwords_ratio" in columns:
+                cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
                 cutoff_stopwords_ratio = st.sidebar.slider(
+                    cutoff_def, 0.0, 1.0, 0.0, step=0.01
                 )
                 new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
                 keys.append(new_key)
                 print_discared_by_cond(cond)
             if "badwords_ratio" in columns:
+                cutoff_def = "If the bad words ratio of a document is higher than this number, the document is removed."
                 cutoff_badwords_ratio = st.sidebar.slider(
+                    cutoff_def, 0.0, 1.0, 1.0, step=0.01
                 )
                 new_key = ("badwords_ratio", cutoff_badwords_ratio, True)
                 keys.append(new_key)
                 print_discared_by_cond(cond)
             if "lang_id_score" in columns:
+                cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
                 cutoff_lang_id_score = st.sidebar.slider(
+                    cutoff_def, 0.0, 1.0, 0.0, step=0.01
                 )
                 new_key = ("lang_id_score", cutoff_lang_id_score, False)
                 keys.append(new_key)
                 print_discared_by_cond(cond)
             if "perplexity_score" in columns:
+                cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
                 max_pp = int(np.max(docs["perplexity_score"])) + 1
                 cutoff_perplexity_score = st.sidebar.slider(
+                    cutoff_def, 0, max_pp, max_pp
                 )
                 new_key = ("perplexity_score", cutoff_perplexity_score, True)
                 keys.append(new_key)
     def filtering_of_words(self):
         st.sidebar.subheader("Parameter of the filtering on words")
+        cutoff_def = (
+            "If the length of a word is higher than this number, the word is removed."
         )
+        max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
+        cutoff_word = st.sidebar.slider(cutoff_def, 0, max_len_word, max_len_word)
         incorrect_substrings = st.sidebar.checkbox(
+            "Remove words with incorrect substrings."
         )
         cond_words = self.words["len_word"] <= cutoff_word
             )
     def visualization(self):
+        self.preamble()
         self.open_data()
         self.set_title()
         self.filtering_of_docs()
         self.download_data()
+path_instructions = "./filtering_pipeline_oscar.pdf"
 path_data = "./en_examples_with_stats.json"
 lang = "English"
 num_docs = 5000
 max_len_text_display = 10000
 visualization = Visualization(
+    path_instructions, path_data, lang, num_docs, num_docs_for_words, max_len_text_display
 )
 visualization.visualization()

filtering_pipeline_oscar.pdf ADDED Viewed

Binary file (196 kB). View file