Commit
•
6bd9538
1
Parent(s):
27f8ecd
Update filters and has_pipeline explorer
Browse files
app.py
CHANGED
@@ -1,11 +1,10 @@
|
|
1 |
-
import asyncio
|
2 |
import urllib
|
3 |
from typing import Iterable
|
4 |
|
5 |
import gradio as gr
|
6 |
import markdown as md
|
7 |
import pandas as pd
|
8 |
-
from distilabel.cli.pipeline.utils import
|
9 |
from gradio_leaderboard import ColumnFilter, Leaderboard, SearchColumns, SelectColumns
|
10 |
from gradio_modal import Modal
|
11 |
from huggingface_hub import HfApi, HfFileSystem, RepoCard
|
@@ -27,7 +26,7 @@ def _categorize_dtypes(df):
|
|
27 |
}
|
28 |
|
29 |
categorized_dtypes = []
|
30 |
-
for
|
31 |
dtype_str = str(dtype)
|
32 |
if dtype_str in dtype_mapping:
|
33 |
categorized_dtypes.append(dtype_mapping[dtype_str])
|
@@ -42,22 +41,18 @@ def _get_tag_category(entry: list[str], tag_category: str):
|
|
42 |
else:
|
43 |
return None
|
44 |
|
45 |
-
def
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
else:
|
52 |
-
return ""
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
async def check_pipelines(repo_ids):
|
57 |
-
tasks = [_has_pipeline(fs, repo_id) for repo_id in repo_ids]
|
58 |
-
results = await asyncio.gather(*tasks)
|
59 |
|
60 |
-
|
|
|
|
|
|
|
|
|
61 |
|
62 |
def _search_distilabel_repos(query: str = None,):
|
63 |
filter = "library:distilabel"
|
@@ -67,8 +62,7 @@ def _search_distilabel_repos(query: str = None,):
|
|
67 |
data = [ex.__dict__ for ex in datasets]
|
68 |
df = pd.DataFrame.from_records(data)
|
69 |
df["size_categories"] = df.tags.apply(_get_tag_category, args=["size_categories"])
|
70 |
-
|
71 |
-
df["has_pipeline"] = ""
|
72 |
subset_columns = ['id', 'likes', 'downloads', "size_categories", 'has_pipeline', 'last_modified', 'description']
|
73 |
new_column_order = subset_columns + [col for col in df.columns if col not in subset_columns]
|
74 |
df = df[new_column_order]
|
@@ -87,17 +81,18 @@ def _create_modal_info(row: dict) -> str:
|
|
87 |
|
88 |
return "<br>".join([
|
89 |
_get_main_title(repo_id=row["id"]),
|
90 |
-
f'pipeline available: {_has_pipeline(repo_id=row["id"])}',
|
91 |
_embed_dataset_viewer(repo_id=row["id"]),
|
92 |
_get_dataset_card(repo_id=row["id"]),
|
|
|
|
|
|
|
93 |
])
|
94 |
|
95 |
# Define the Gradio interface
|
96 |
with gr.Blocks(delete_cache=[1,1]) as demo:
|
97 |
gr.Markdown("# ⚗️ Distilabel Synthetic Data Pipeline Finder")
|
98 |
-
gr.HTML("Select a
|
99 |
df: pd.DataFrame = _search_distilabel_repos()
|
100 |
-
|
101 |
leader_board = Leaderboard(
|
102 |
value=df,
|
103 |
datatype=_categorize_dtypes(df),
|
@@ -108,30 +103,43 @@ with gr.Blocks(delete_cache=[1,1]) as demo:
|
|
108 |
ColumnFilter("likes", type="slider", min=0, max=df.likes.max(), default=[0, df.likes.max()]),
|
109 |
ColumnFilter("downloads", type="slider", min=0, max=df.downloads.max(), default=[0, df.downloads.max()]),
|
110 |
ColumnFilter("size_categories", type="checkboxgroup"),
|
111 |
-
ColumnFilter("has_pipeline", type="
|
112 |
],
|
113 |
hide_columns=[
|
114 |
"_id", "private", "gated", "disabled", "sha", "downloads_all_time", "paperswithcode_id", "tags", "siblings",
|
115 |
"cardData", "lastModified", "card_data", "key"],
|
116 |
-
select_columns=SelectColumns(default_selection=["id", "last_modified", "downloads", "likes", "size_categories"],
|
117 |
cant_deselect=["id"],
|
118 |
label="Select The Columns",
|
119 |
info="Helpful information"),
|
120 |
)
|
121 |
|
122 |
with Modal() as modal:
|
123 |
-
|
|
|
|
|
|
|
124 |
|
125 |
-
def update(leader_board,
|
126 |
if not isinstance(evt.index, int):
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
|
|
|
|
|
|
|
|
131 |
else:
|
132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
|
134 |
-
leader_board.select(update, [leader_board, markdown], [leader_board, markdown, modal], show_progress="hidden")
|
135 |
|
136 |
|
137 |
if __name__ == "__main__":
|
|
|
|
|
1 |
import urllib
|
2 |
from typing import Iterable
|
3 |
|
4 |
import gradio as gr
|
5 |
import markdown as md
|
6 |
import pandas as pd
|
7 |
+
from distilabel.cli.pipeline.utils import get_config_from_url
|
8 |
from gradio_leaderboard import ColumnFilter, Leaderboard, SearchColumns, SelectColumns
|
9 |
from gradio_modal import Modal
|
10 |
from huggingface_hub import HfApi, HfFileSystem, RepoCard
|
|
|
26 |
}
|
27 |
|
28 |
categorized_dtypes = []
|
29 |
+
for _, dtype in df.dtypes.items():
|
30 |
dtype_str = str(dtype)
|
31 |
if dtype_str in dtype_mapping:
|
32 |
categorized_dtypes.append(dtype_mapping[dtype_str])
|
|
|
41 |
else:
|
42 |
return None
|
43 |
|
44 |
+
def _check_pipeline(repo_id):
    """Return the parsed distilabel pipeline config of a dataset repo, if any.

    Probes the root of the dataset repository for ``pipeline.json``,
    ``pipeline.yaml`` and ``pipeline.yml`` (in that order) and loads the
    first one that exists.

    Args:
        repo_id: Hub dataset identifier, e.g. ``"user/dataset"``.

    Returns:
        Whatever ``get_config_from_url`` returns for the first pipeline file
        found, or ``None`` when the repository contains none of them.
    """
    for file_type in (".json", ".yaml", ".yml"):
        filename = f"pipeline{file_type}"
        # ``fs`` is a module-level filesystem object defined elsewhere in the
        # file (presumably an ``HfFileSystem`` -- confirm); dataset files are
        # addressed as ``datasets/<repo_id>/<path>``.
        if fs.exists(f"datasets/{repo_id}/{filename}"):
            # File contents on the Hub are served under
            # ``/resolve/<revision>/``; a URL without that segment does not
            # point at the file itself. ``main`` is assumed to be the default
            # branch, matching the revision ``fs.exists`` checks by default.
            url = (
                f"https://huggingface.co/datasets/{repo_id}"
                f"/resolve/main/{filename}"
            )
            return get_config_from_url(url)
    # Explicit "no pipeline file" result instead of an implicit fall-through.
    return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
+
def _has_pipline(x):
|
52 |
+
if isinstance(x, str):
|
53 |
+
if "distilabel pipeline run" in x:
|
54 |
+
return "yes"
|
55 |
+
return "no"
|
56 |
|
57 |
def _search_distilabel_repos(query: str = None,):
|
58 |
filter = "library:distilabel"
|
|
|
62 |
data = [ex.__dict__ for ex in datasets]
|
63 |
df = pd.DataFrame.from_records(data)
|
64 |
df["size_categories"] = df.tags.apply(_get_tag_category, args=["size_categories"])
|
65 |
+
df["has_pipeline"] = df.description.apply(_has_pipline)
|
|
|
66 |
subset_columns = ['id', 'likes', 'downloads', "size_categories", 'has_pipeline', 'last_modified', 'description']
|
67 |
new_column_order = subset_columns + [col for col in df.columns if col not in subset_columns]
|
68 |
df = df[new_column_order]
|
|
|
81 |
|
82 |
return "<br>".join([
|
83 |
_get_main_title(repo_id=row["id"]),
|
|
|
84 |
_embed_dataset_viewer(repo_id=row["id"]),
|
85 |
_get_dataset_card(repo_id=row["id"]),
|
86 |
+
]), "<br>".join([
|
87 |
+
_get_main_title(repo_id=row["id"]),
|
88 |
+
f'pipeline available: {_check_pipeline(repo_id=row["id"])}'
|
89 |
])
|
90 |
|
91 |
# Define the Gradio interface
|
92 |
with gr.Blocks(delete_cache=[1,1]) as demo:
|
93 |
gr.Markdown("# ⚗️ Distilabel Synthetic Data Pipeline Finder")
|
94 |
+
gr.HTML("Select a repo_id to show the pipeline, dataset viewer and model card.")
|
95 |
df: pd.DataFrame = _search_distilabel_repos()
|
|
|
96 |
leader_board = Leaderboard(
|
97 |
value=df,
|
98 |
datatype=_categorize_dtypes(df),
|
|
|
103 |
ColumnFilter("likes", type="slider", min=0, max=df.likes.max(), default=[0, df.likes.max()]),
|
104 |
ColumnFilter("downloads", type="slider", min=0, max=df.downloads.max(), default=[0, df.downloads.max()]),
|
105 |
ColumnFilter("size_categories", type="checkboxgroup"),
|
106 |
+
ColumnFilter("has_pipeline", type="dropdown"),
|
107 |
],
|
108 |
hide_columns=[
|
109 |
"_id", "private", "gated", "disabled", "sha", "downloads_all_time", "paperswithcode_id", "tags", "siblings",
|
110 |
"cardData", "lastModified", "card_data", "key"],
|
111 |
+
select_columns=SelectColumns(default_selection=["id", "last_modified", "downloads", "likes", "has_pipeline", "size_categories"],
|
112 |
cant_deselect=["id"],
|
113 |
label="Select The Columns",
|
114 |
info="Helpful information"),
|
115 |
)
|
116 |
|
117 |
with Modal() as modal:
|
118 |
+
with gr.Tab(label="dataset"):
|
119 |
+
markdown_1 = gr.HTML()
|
120 |
+
with gr.Tab(label="pipeline"):
|
121 |
+
markdown_2 = gr.HTML()
|
122 |
|
123 |
+
# Selection callback for the leaderboard table.
#
# Receives the current leaderboard DataFrame, the current values of the two
# HTML panes (markdown_1, markdown_2) and the gr.SelectData event, and returns
# the 4-tuple (leader_board, markdown_1, markdown_2, modal).
#
# When evt.index is not an int, the rows whose "id" equals evt.value are
# selected. If none match, gr.Info asks the user to press a repo-id cell;
# otherwise the first matching row is turned into a dict and passed to
# _create_modal_info, whose two results replace markdown_1 and markdown_2.
# When evt.index is an int, a Modal(visible=False) is returned and the other
# values are passed through unchanged.
#
# NOTE(review): indentation is lost in this view, so it is not possible to
# tell whether `modal = Modal(visible=True)` and the first `return` belong to
# the inner `else` or follow the inner if/else -- confirm against the real
# file. In the first reading the "no matching row" path returns nothing; in
# the second it opens the modal with the previous pane contents.
def update(leader_board: pd.DataFrame, markdown_1, markdown_2, evt: gr.SelectData):
|
124 |
if not isinstance(evt.index, int):
|
125 |
+
leader_board_filtered = leader_board[leader_board["id"] == evt.value]
|
126 |
+
if leader_board_filtered.empty:
|
127 |
+
gr.Info("Press a cell with the repo id.")
|
128 |
+
else:
|
129 |
+
row = leader_board_filtered.iloc[0].to_dict()
|
130 |
+
markdown_1, markdown_2 = _create_modal_info(row=row)
|
131 |
+
modal = Modal(visible=True)
|
132 |
+
return leader_board, markdown_1, markdown_2, modal
|
133 |
else:
|
134 |
+
modal = Modal(visible=False)
|
135 |
+
return leader_board, markdown_1, markdown_2, modal
|
136 |
+
|
137 |
+
leader_board.select(
|
138 |
+
update,
|
139 |
+
[leader_board, markdown_1, markdown_2],
|
140 |
+
[leader_board, markdown_1, markdown_2, modal]
|
141 |
+
)
|
142 |
|
|
|
143 |
|
144 |
|
145 |
if __name__ == "__main__":
|