Spaces:

avid-ml
/

biasaware

Running

App Files Community

freyam committed on Nov 3, 2023

Commit

0998e6d

1 Parent(s): 8ab9329

Fix HuggingFace global search

Browse files

Files changed (1) hide show

app.py +123 -94

app.py CHANGED Viewed

@@ -1,26 +1,24 @@
-import os
 import json
 import timeit
 import gradio as gr
 import pandas as pd
-from datetime import date
 from scripts.genbit import *
-from scripts.gender_profession_bias import *
 from scripts.gender_distribution import *
-from datasets import load_dataset as hf_load_dataset
-from huggingface_hub import DatasetFilter, list_datasets
-from avidtools.datamodels.report import Report
-from avidtools.datamodels.components import *
-from avidtools.datamodels.enums import *
-MAX_THRESHOLD = 1000
-METHODOLOGIES = json.load(open("config/methodologies.json", "r", encoding="utf8"))
-DATASET = {
-    "name": None,
     "source": None,
     "df": None,
     "sampling_method": None,
@@ -32,15 +30,15 @@ DATASET = {
 def generate_avid_report():
-    dataset_id = DATASET["name"]
-    methodology = DATASET["methodology"]
-    result_json = DATASET["result_df"].to_dict(orient="list")
     report = Report()
     report.affects = Affects(
         developer=[],
-        deployer=["Hugging Face"] if DATASET["source"] == "HuggingFace Hub" else [],
         artifacts=[Artifact(type=ArtifactTypeEnum.dataset, name=dataset_id)],
     )
     report.problemtype = Problemtype(
@@ -60,13 +58,14 @@ def generate_avid_report():
     report.references = (
         [
             Reference(
-                label="""{dataset_id} on Hugging Face""".format(model_id=dataset_id),
-                url="""https://huggingface.co/{dataset_id}""".format(
                     dataset_id=dataset_id
                 ),
             )
         ]
-        if DATASET["source"] == "HuggingFace Hub"
         else []
     )
     report.description = LangValue(
@@ -87,16 +86,16 @@ def generate_avid_report():
 def evaluate():
-    if DATASET["methodology"] == "GenBiT (Microsoft Gender Bias Tool)":
-        DATASET["sampling_size"] = min(DATASET["sampling_size"], 100)
     print(
-        f"Dataset          : {DATASET['name']}\n"
-        f"Source           : {DATASET['source']}\n"
-        f"Sampling Method  : {DATASET['sampling_method']}\n"
-        f"Sampling Size    : {DATASET['sampling_size']}\n"
-        f"Column           : {DATASET['column']}\n"
-        f"Methodology      : {DATASET['methodology']}\n"
         f"Time Taken       : ",
         end="",
     )
@@ -104,21 +103,21 @@ def evaluate():
     try:
         start = timeit.default_timer()
-        data = DATASET["df"].copy()
-        data = data[[DATASET["column"]]]
-        if DATASET["sampling_method"] == "First":
-            data = data.head(DATASET["sampling_size"])
-        elif DATASET["sampling_method"] == "Last":
-            data = data.tail(DATASET["sampling_size"])
-        elif DATASET["sampling_method"] == "Random":
-            data = data.sample(n=DATASET["sampling_size"], random_state=42)
         result_df, result_plot = globals()[
-            METHODOLOGIES.get(DATASET["methodology"]).get("fx")
         ](data)
-        DATASET["result_df"] = result_df
         stop = timeit.default_timer()
@@ -141,20 +140,19 @@ def evaluate():
 def load_dataset(local_dataset, hf_dataset):
-    DATASET["name"] = (
-        os.path.splitext(os.path.basename(local_dataset.name))[0]
-        if local_dataset
-        else hf_dataset
-    )
-    DATASET["source"] = "Local Dataset" if local_dataset else "HuggingFace Hub"
-    DATASET["df"] = (
-        pd.read_csv(local_dataset.name)
-        if local_dataset
-        else hf_load_dataset(hf_dataset, split="train[0:100]").to_pandas()
-    )
-    columns = DATASET["df"].select_dtypes(include=["object"]).columns.tolist()
-    column_corpus = DATASET["df"][columns[0]].tolist()[:5]
     dataset_sampling_method = gr.Radio(
         label="Scope",
@@ -167,10 +165,10 @@ def load_dataset(local_dataset, hf_dataset):
     dataset_sampling_size = gr.Slider(
         label=f"Number of Entries",
-        info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {MAX_THRESHOLD}.",
         minimum=1,
-        maximum=min(DATASET["df"].shape[0], MAX_THRESHOLD),
-        value=min(DATASET["df"].shape[0], MAX_THRESHOLD),
         visible=True,
         interactive=True,
     )
@@ -204,44 +202,27 @@ def load_dataset(local_dataset, hf_dataset):
     )
-def show_hf_dataset_search_results(hf_dataset):
-    choices = [
-        dataset.id
-        for dataset in list_datasets(
-            filter=DatasetFilter(dataset_name=hf_dataset, language="en"), limit=10
-        )
-    ]
-    return (
-        gr.Button(
-            value=f"Load",
-            interactive=True,
-            variant="secondary",
-            visible=True,
-        ),
-        gr.Radio(
-            label="HuggingFace Hub Search Results",
-            info="Select the dataset to be imported",
-            choices=choices,
-            value=choices[0],
-            interactive=True,
-            visible=True,
-        ),
-    )
 def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_column):
-    DATASET["sampling_method"] = dataset_sampling_method
-    DATASET["sampling_size"] = dataset_sampling_size
-    DATASET["column"] = dataset_column
     return (
         gr.Markdown(
-            f"## Results (Dataset: {'✅' if DATASET['name'] else '❎'}) (Methodology: {'✅' if DATASET['methodology'] else '❎'})"
         ),
         gr.Button(
             value="Evaluate",
-            interactive=(True if DATASET["name"] and DATASET["methodology"] else False),
             variant="primary",
             visible=True,
         ),
@@ -249,11 +230,16 @@ def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_colum
 def import_methodology(methodology):
-    DATASET["methodology"] = methodology
     return (
         gr.Markdown(
-            f"## Results (Dataset: {'✅' if DATASET['name'] else '❎'}) (Methodology: {'✅' if DATASET['methodology'] else '❎'})"
         ),
         gr.Markdown(
             METHODOLOGIES[methodology]["description"],
@@ -261,7 +247,11 @@ def import_methodology(methodology):
         ),
         gr.Button(
             value="Evaluate",
-            interactive=(True if DATASET["name"] and DATASET["methodology"] else False),
             variant="primary",
             visible=True,
         ),
@@ -330,7 +320,9 @@ with BiasAware:
             methodology_description = gr.Markdown(visible=False)
         with gr.Column(scale=2):
-            result_title = gr.Markdown("## Results (Dataset: ❎) (Methodology: ❎)")
             evaluation_btn = gr.Button(
                 value="Evaluate",
@@ -361,7 +353,7 @@ with BiasAware:
             gr.Textbox(
                 label="HuggingFace Hub",
                 placeholder="Search for a dataset",
-                value="amazon_multi",
                 interactive=True,
                 visible=True,
             )
@@ -399,7 +391,12 @@ with BiasAware:
     )
     hf_dataset.submit(
-        fn=show_hf_dataset_search_results,
         inputs=[hf_dataset],
         outputs=[dataset_load_btn],
     )
@@ -419,7 +416,7 @@ with BiasAware:
     dataset_column.input(
         fn=lambda column: gr.Dataframe(
             value=pd.DataFrame(
-                {f"{column}": DATASET["df"][column].tolist()[:5]},
             ),
             visible=True,
         ),
@@ -456,3 +453,35 @@ with BiasAware:
 if __name__ == "__main__":
     BiasAware.launch()

 import json
+import os
 import timeit
+from datetime import date
 import gradio as gr
 import pandas as pd
+from avidtools.datamodels.components import *
+from avidtools.datamodels.enums import *
+from avidtools.datamodels.report import Report
+from datasets import load_dataset as hf_load_dataset
 from scripts.genbit import *
 from scripts.gender_distribution import *
+from scripts.gender_profession_bias import *
+SAMPLING_SIZE_THRESHOLD = 2000
+METHODOLOGIES = json.load(open("config/methodologies.json", "r"))
+EVALUATION = {
+    "dataset_id": None,
     "source": None,
     "df": None,
     "sampling_method": None,
 def generate_avid_report():
+    dataset_id = EVALUATION["dataset_id"]
+    methodology = EVALUATION["methodology"]
+    result_json = EVALUATION["result_df"].to_dict(orient="list")
     report = Report()
     report.affects = Affects(
         developer=[],
+        deployer=["Hugging Face"] if EVALUATION["source"] == "HuggingFace Hub" else [],
         artifacts=[Artifact(type=ArtifactTypeEnum.dataset, name=dataset_id)],
     )
     report.problemtype = Problemtype(
     report.references = (
         [
             Reference(
+                type="",
+                label="""{dataset_id} on Hugging Face""".format(dataset_id=dataset_id),
+                url="""https://huggingface.co/datasets/{dataset_id}""".format(
                     dataset_id=dataset_id
                 ),
             )
         ]
+        if EVALUATION["source"] == "HuggingFace Hub"
         else []
     )
     report.description = LangValue(
 def evaluate():
+    if EVALUATION["methodology"] == "GenBiT (Microsoft Gender Bias Tool)":
+        EVALUATION["sampling_size"] = min(EVALUATION["sampling_size"], 100)
     print(
+        f"Dataset          : {EVALUATION['dataset_id']}\n"
+        f"Source           : {EVALUATION['source']}\n"
+        f"Sampling Method  : {EVALUATION['sampling_method']}\n"
+        f"Sampling Size    : {EVALUATION['sampling_size']}\n"
+        f"Column           : {EVALUATION['column']}\n"
+        f"Methodology      : {EVALUATION['methodology']}\n"
         f"Time Taken       : ",
         end="",
     )
     try:
         start = timeit.default_timer()
+        data = EVALUATION["df"].copy()
+        data = data[[EVALUATION["column"]]]
+        if EVALUATION["sampling_method"] == "First":
+            data = data.head(EVALUATION["sampling_size"])
+        elif EVALUATION["sampling_method"] == "Last":
+            data = data.tail(EVALUATION["sampling_size"])
+        elif EVALUATION["sampling_method"] == "Random":
+            data = data.sample(n=EVALUATION["sampling_size"], random_state=42)
         result_df, result_plot = globals()[
+            METHODOLOGIES.get(EVALUATION["methodology"]).get("fx")
         ](data)
+        EVALUATION["result_df"] = result_df
         stop = timeit.default_timer()
 def load_dataset(local_dataset, hf_dataset):
+    if local_dataset:
+        EVALUATION["dataset_id"] = os.path.splitext(
+            os.path.basename(local_dataset.name)
+        )[0]
+        EVALUATION["source"] = "Local Dataset"
+        EVALUATION["df"] = pd.read_csv(local_dataset.name)
+    else:
+        EVALUATION["dataset_id"] = hf_dataset
+        EVALUATION["source"] = "HuggingFace Hub"
+        EVALUATION["df"] = hf_load_dataset(hf_dataset, split="train[0:100]").to_pandas()
+    columns = EVALUATION["df"].select_dtypes(include=["object"]).columns.tolist()
+    column_corpus = EVALUATION["df"][columns[0]].tolist()[:5]
     dataset_sampling_method = gr.Radio(
         label="Scope",
     dataset_sampling_size = gr.Slider(
         label=f"Number of Entries",
+        info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {SAMPLING_SIZE_THRESHOLD}.",
         minimum=1,
+        maximum=min(EVALUATION["df"].shape[0], SAMPLING_SIZE_THRESHOLD),
+        value=min(EVALUATION["df"].shape[0], SAMPLING_SIZE_THRESHOLD),
         visible=True,
         interactive=True,
     )
     )
 def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_column):
+    EVALUATION["sampling_method"] = dataset_sampling_method
+    EVALUATION["sampling_size"] = dataset_sampling_size
+    EVALUATION["column"] = dataset_column
     return (
         gr.Markdown(
+            "## Results (Dataset: {}{}) (Methodology: {}{})".format(
+                "\u2705" if EVALUATION["dataset_id"] else "\u274E",
+                "",
+                "\u2705" if EVALUATION["methodology"] else "\u274E",
+                "",
+            )
         ),
         gr.Button(
             value="Evaluate",
+            interactive=(
+                True
+                if EVALUATION["dataset_id"] and EVALUATION["methodology"]
+                else False
+            ),
             variant="primary",
             visible=True,
         ),
 def import_methodology(methodology):
+    EVALUATION["methodology"] = methodology
     return (
         gr.Markdown(
+            "## Results (Dataset: {}{}) (Methodology: {}{})".format(
+                "\u2705" if EVALUATION["dataset_id"] else "\u274E",
+                "",
+                "\u2705" if EVALUATION["methodology"] else "\u274E",
+                "",
+            )
         ),
         gr.Markdown(
             METHODOLOGIES[methodology]["description"],
         ),
         gr.Button(
             value="Evaluate",
+            interactive=(
+                True
+                if EVALUATION["dataset_id"] and EVALUATION["methodology"]
+                else False
+            ),
             variant="primary",
             visible=True,
         ),
             methodology_description = gr.Markdown(visible=False)
         with gr.Column(scale=2):
+            result_title = gr.Markdown(
+                "## Results (Dataset: \u274E) (Methodology: \u274E)"
+            )
             evaluation_btn = gr.Button(
                 value="Evaluate",
             gr.Textbox(
                 label="HuggingFace Hub",
                 placeholder="Search for a dataset",
+                value="imdb",
                 interactive=True,
                 visible=True,
             )
     )
     hf_dataset.submit(
+        fn=lambda _: gr.Button(
+            value=f"Load",
+            interactive=True,
+            variant="secondary",
+            visible=True,
+        ),
         inputs=[hf_dataset],
         outputs=[dataset_load_btn],
     )
     dataset_column.input(
         fn=lambda column: gr.Dataframe(
             value=pd.DataFrame(
+                {f"{column}": EVALUATION["df"][column].tolist()[:5]},
             ),
             visible=True,
         ),
 if __name__ == "__main__":
     BiasAware.launch()
+if __name__ == "__main__":
+    BiasAware.launch()
+if __name__ == "__main__":
+    BiasAware.launch()
+if __name__ == "__main__":
+    BiasAware.launch()
+if __name__ == "__main__":
+    BiasAware.launch()
+if __name__ == "__main__":
+    BiasAware.launch()
+if __name__ == "__main__":
+    BiasAware.launch()
+if __name__ == "__main__":
+    BiasAware.launch()
+if __name__ == "__main__":
+    BiasAware.launch()