Spaces:

davidberenstein1957
/

distilabel-synthetic-data-pipeline-explorer

Runtime error

App Files Files Community

davidberenstein1957 committed on Jul 27, 2024

Commit

6bd9538

1 Parent(s): 27f8ecd

Update filters and has_pipeline explorer

Browse files

Files changed (1) hide show

app.py +41 -33

app.py CHANGED Viewed

@@ -1,11 +1,10 @@
-import asyncio
 import urllib
 from typing import Iterable
 import gradio as gr
 import markdown as md
 import pandas as pd
-from distilabel.cli.pipeline.utils import _build_pipeline_panel, get_pipeline
 from gradio_leaderboard import ColumnFilter, Leaderboard, SearchColumns, SelectColumns
 from gradio_modal import Modal
 from huggingface_hub import HfApi, HfFileSystem, RepoCard
@@ -27,7 +26,7 @@ def _categorize_dtypes(df):
     }
     categorized_dtypes = []
-    for column, dtype in df.dtypes.items():
         dtype_str = str(dtype)
         if dtype_str in dtype_mapping:
             categorized_dtypes.append(dtype_mapping[dtype_str])
@@ -42,22 +41,18 @@ def _get_tag_category(entry: list[str], tag_category: str):
     else:
         return None
-def _has_pipeline(repo_id):
-    file_path = f"datasets/{repo_id}/pipeline.log"
-    url = "https://huggingface.co/{file_path}"
-    if fs.exists(file_path):
-        pipeline = get_pipeline(url)
-        return str(_build_pipeline_panel(pipeline))
-    else:
-        return ""
-async def check_pipelines(repo_ids):
-    tasks = [_has_pipeline(fs, repo_id) for repo_id in repo_ids]
-    results = await asyncio.gather(*tasks)
-    return dict(zip(repo_ids, results))
 def _search_distilabel_repos(query: str = None,):
     filter = "library:distilabel"
@@ -67,8 +62,7 @@ def _search_distilabel_repos(query: str = None,):
     data = [ex.__dict__ for ex in datasets]
     df = pd.DataFrame.from_records(data)
     df["size_categories"] = df.tags.apply(_get_tag_category, args=["size_categories"])
-    # df["has_pipeline"] = asyncio.run(check_pipelines(df.id.tolist()))
-    df["has_pipeline"] = ""
     subset_columns = ['id', 'likes', 'downloads', "size_categories", 'has_pipeline', 'last_modified', 'description']
     new_column_order = subset_columns + [col for col in df.columns if col not in subset_columns]
     df = df[new_column_order]
@@ -87,17 +81,18 @@ def _create_modal_info(row: dict) -> str:
     return "<br>".join([
         _get_main_title(repo_id=row["id"]),
-        f'pipeline available: {_has_pipeline(repo_id=row["id"])}',
         _embed_dataset_viewer(repo_id=row["id"]),
         _get_dataset_card(repo_id=row["id"]),
     ])
 # Define the Gradio interface
 with gr.Blocks(delete_cache=[1,1]) as demo:
     gr.Markdown("# ⚗️ Distilabel Synthetic Data Pipeline Finder")
-    gr.HTML("Select a dataset to show the pipeline, dataset viewer and model card.")
     df: pd.DataFrame = _search_distilabel_repos()
     leader_board = Leaderboard(
         value=df,
         datatype=_categorize_dtypes(df),
@@ -108,30 +103,43 @@ with gr.Blocks(delete_cache=[1,1]) as demo:
             ColumnFilter("likes", type="slider", min=0, max=df.likes.max(), default=[0, df.likes.max()]),
             ColumnFilter("downloads", type="slider", min=0, max=df.downloads.max(), default=[0, df.downloads.max()]),
             ColumnFilter("size_categories", type="checkboxgroup"),
-            ColumnFilter("has_pipeline", type="checkboxgroup"),
         ],
         hide_columns=[
             "_id", "private", "gated", "disabled", "sha", "downloads_all_time", "paperswithcode_id", "tags", "siblings",
             "cardData", "lastModified", "card_data", "key"],
-        select_columns=SelectColumns(default_selection=["id", "last_modified", "downloads", "likes", "size_categories"],
                                     cant_deselect=["id"],
                                     label="Select The Columns",
                                     info="Helpful information"),
     )
     with Modal() as modal:
-        markdown = gr.HTML(value="test")
-    def update(leader_board, markdown, evt: gr.SelectData):
         if not isinstance(evt.index, int):
-            index = evt.index[0]  # Assuming evt.index is a list or similar structure
-            markdown = _create_modal_info(row=leader_board.iloc[index].to_dict())
-            modal = Modal(visible=True)
-            return leader_board, markdown, modal
         else:
-            return leader_board, markdown
-    leader_board.select(update, [leader_board, markdown], [leader_board, markdown, modal], show_progress="hidden")
 if __name__ == "__main__":

 import urllib
 from typing import Iterable
 import gradio as gr
 import markdown as md
 import pandas as pd
+from distilabel.cli.pipeline.utils import get_config_from_url
 from gradio_leaderboard import ColumnFilter, Leaderboard, SearchColumns, SelectColumns
 from gradio_modal import Modal
 from huggingface_hub import HfApi, HfFileSystem, RepoCard
     }
     categorized_dtypes = []
+    for _, dtype in df.dtypes.items():
         dtype_str = str(dtype)
         if dtype_str in dtype_mapping:
             categorized_dtypes.append(dtype_mapping[dtype_str])
     else:
         return None
+def _check_pipeline(repo_id):
+    for file_type in [".json", ".yaml", ".yml"]:
+        file_path = f"datasets/{repo_id}/pipeline{file_type}"
+        url = f"https://huggingface.co/{file_path}"
+        if fs.exists(file_path):
+            return get_config_from_url(url)
+def _has_pipline(x):
+    if isinstance(x, str):
+        if "distilabel pipeline run" in x:
+            return "yes"
+    return "no"
 def _search_distilabel_repos(query: str = None,):
     filter = "library:distilabel"
     data = [ex.__dict__ for ex in datasets]
     df = pd.DataFrame.from_records(data)
     df["size_categories"] = df.tags.apply(_get_tag_category, args=["size_categories"])
+    df["has_pipeline"] = df.description.apply(_has_pipline)
     subset_columns = ['id', 'likes', 'downloads', "size_categories", 'has_pipeline', 'last_modified', 'description']
     new_column_order = subset_columns + [col for col in df.columns if col not in subset_columns]
     df = df[new_column_order]
     return "<br>".join([
         _get_main_title(repo_id=row["id"]),
         _embed_dataset_viewer(repo_id=row["id"]),
         _get_dataset_card(repo_id=row["id"]),
+    ]), "<br>".join([
+        _get_main_title(repo_id=row["id"]),
+        f'pipeline available: {_check_pipeline(repo_id=row["id"])}'
     ])
 # Define the Gradio interface
 with gr.Blocks(delete_cache=[1,1]) as demo:
     gr.Markdown("# ⚗️ Distilabel Synthetic Data Pipeline Finder")
+    gr.HTML("Select a repo_id to show the pipeline, dataset viewer and model card.")
     df: pd.DataFrame = _search_distilabel_repos()
     leader_board = Leaderboard(
         value=df,
         datatype=_categorize_dtypes(df),
             ColumnFilter("likes", type="slider", min=0, max=df.likes.max(), default=[0, df.likes.max()]),
             ColumnFilter("downloads", type="slider", min=0, max=df.downloads.max(), default=[0, df.downloads.max()]),
             ColumnFilter("size_categories", type="checkboxgroup"),
+            ColumnFilter("has_pipeline", type="dropdown"),
         ],
         hide_columns=[
             "_id", "private", "gated", "disabled", "sha", "downloads_all_time", "paperswithcode_id", "tags", "siblings",
             "cardData", "lastModified", "card_data", "key"],
+        select_columns=SelectColumns(default_selection=["id", "last_modified", "downloads", "likes", "has_pipeline", "size_categories"],
                                     cant_deselect=["id"],
                                     label="Select The Columns",
                                     info="Helpful information"),
     )
     with Modal() as modal:
+        with gr.Tab(label="dataset"):
+            markdown_1 = gr.HTML()
+        with gr.Tab(label="pipeline"):
+            markdown_2 = gr.HTML()
+    def update(leader_board: pd.DataFrame, markdown_1, markdown_2, evt: gr.SelectData):
         if not isinstance(evt.index, int):
+            leader_board_filtered = leader_board[leader_board["id"] == evt.value]
+            if leader_board_filtered.empty:
+                gr.Info("Press a cell with the repo id.")
+            else:
+                row = leader_board_filtered.iloc[0].to_dict()
+                markdown_1, markdown_2 = _create_modal_info(row=row)
+                modal = Modal(visible=True)
+            return leader_board, markdown_1, markdown_2, modal
         else:
+            modal = Modal(visible=False)
+            return leader_board, markdown_1, markdown_2, modal
+    leader_board.select(
+        update,
+        [leader_board, markdown_1, markdown_2],
+        [leader_board, markdown_1, markdown_2, modal]
+    )
 if __name__ == "__main__":