davidberenstein1957 HF staff committed on
Commit
6bd9538
1 Parent(s): 27f8ecd

Update filters and has_pipeline explorer

Browse files
Files changed (1) hide show
  1. app.py +41 -33
app.py CHANGED
@@ -1,11 +1,10 @@
1
- import asyncio
2
  import urllib
3
  from typing import Iterable
4
 
5
  import gradio as gr
6
  import markdown as md
7
  import pandas as pd
8
- from distilabel.cli.pipeline.utils import _build_pipeline_panel, get_pipeline
9
  from gradio_leaderboard import ColumnFilter, Leaderboard, SearchColumns, SelectColumns
10
  from gradio_modal import Modal
11
  from huggingface_hub import HfApi, HfFileSystem, RepoCard
@@ -27,7 +26,7 @@ def _categorize_dtypes(df):
27
  }
28
 
29
  categorized_dtypes = []
30
- for column, dtype in df.dtypes.items():
31
  dtype_str = str(dtype)
32
  if dtype_str in dtype_mapping:
33
  categorized_dtypes.append(dtype_mapping[dtype_str])
@@ -42,22 +41,18 @@ def _get_tag_category(entry: list[str], tag_category: str):
42
  else:
43
  return None
44
 
45
- def _has_pipeline(repo_id):
46
- file_path = f"datasets/{repo_id}/pipeline.log"
47
- url = "https://huggingface.co/{file_path}"
48
- if fs.exists(file_path):
49
- pipeline = get_pipeline(url)
50
- return str(_build_pipeline_panel(pipeline))
51
- else:
52
- return ""
53
-
54
-
55
-
56
- async def check_pipelines(repo_ids):
57
- tasks = [_has_pipeline(fs, repo_id) for repo_id in repo_ids]
58
- results = await asyncio.gather(*tasks)
59
 
60
- return dict(zip(repo_ids, results))
 
 
 
 
61
 
62
  def _search_distilabel_repos(query: str = None,):
63
  filter = "library:distilabel"
@@ -67,8 +62,7 @@ def _search_distilabel_repos(query: str = None,):
67
  data = [ex.__dict__ for ex in datasets]
68
  df = pd.DataFrame.from_records(data)
69
  df["size_categories"] = df.tags.apply(_get_tag_category, args=["size_categories"])
70
- # df["has_pipeline"] = asyncio.run(check_pipelines(df.id.tolist()))
71
- df["has_pipeline"] = ""
72
  subset_columns = ['id', 'likes', 'downloads', "size_categories", 'has_pipeline', 'last_modified', 'description']
73
  new_column_order = subset_columns + [col for col in df.columns if col not in subset_columns]
74
  df = df[new_column_order]
@@ -87,17 +81,18 @@ def _create_modal_info(row: dict) -> str:
87
 
88
  return "<br>".join([
89
  _get_main_title(repo_id=row["id"]),
90
- f'pipeline available: {_has_pipeline(repo_id=row["id"])}',
91
  _embed_dataset_viewer(repo_id=row["id"]),
92
  _get_dataset_card(repo_id=row["id"]),
 
 
 
93
  ])
94
 
95
  # Define the Gradio interface
96
  with gr.Blocks(delete_cache=[1,1]) as demo:
97
  gr.Markdown("# ⚗️ Distilabel Synthetic Data Pipeline Finder")
98
- gr.HTML("Select a dataset to show the pipeline, dataset viewer and model card.")
99
  df: pd.DataFrame = _search_distilabel_repos()
100
-
101
  leader_board = Leaderboard(
102
  value=df,
103
  datatype=_categorize_dtypes(df),
@@ -108,30 +103,43 @@ with gr.Blocks(delete_cache=[1,1]) as demo:
108
  ColumnFilter("likes", type="slider", min=0, max=df.likes.max(), default=[0, df.likes.max()]),
109
  ColumnFilter("downloads", type="slider", min=0, max=df.downloads.max(), default=[0, df.downloads.max()]),
110
  ColumnFilter("size_categories", type="checkboxgroup"),
111
- ColumnFilter("has_pipeline", type="checkboxgroup"),
112
  ],
113
  hide_columns=[
114
  "_id", "private", "gated", "disabled", "sha", "downloads_all_time", "paperswithcode_id", "tags", "siblings",
115
  "cardData", "lastModified", "card_data", "key"],
116
- select_columns=SelectColumns(default_selection=["id", "last_modified", "downloads", "likes", "size_categories"],
117
  cant_deselect=["id"],
118
  label="Select The Columns",
119
  info="Helpful information"),
120
  )
121
 
122
  with Modal() as modal:
123
- markdown = gr.HTML(value="test")
 
 
 
124
 
125
- def update(leader_board, markdown, evt: gr.SelectData):
126
  if not isinstance(evt.index, int):
127
- index = evt.index[0] # Assuming evt.index is a list or similar structure
128
- markdown = _create_modal_info(row=leader_board.iloc[index].to_dict())
129
- modal = Modal(visible=True)
130
- return leader_board, markdown, modal
 
 
 
 
131
  else:
132
- return leader_board, markdown
 
 
 
 
 
 
 
133
 
134
- leader_board.select(update, [leader_board, markdown], [leader_board, markdown, modal], show_progress="hidden")
135
 
136
 
137
  if __name__ == "__main__":
 
 
1
  import urllib
2
  from typing import Iterable
3
 
4
  import gradio as gr
5
  import markdown as md
6
  import pandas as pd
7
+ from distilabel.cli.pipeline.utils import get_config_from_url
8
  from gradio_leaderboard import ColumnFilter, Leaderboard, SearchColumns, SelectColumns
9
  from gradio_modal import Modal
10
  from huggingface_hub import HfApi, HfFileSystem, RepoCard
 
26
  }
27
 
28
  categorized_dtypes = []
29
+ for _, dtype in df.dtypes.items():
30
  dtype_str = str(dtype)
31
  if dtype_str in dtype_mapping:
32
  categorized_dtypes.append(dtype_mapping[dtype_str])
 
41
  else:
42
  return None
43
 
44
def _check_pipeline(repo_id):
    """Return the parsed distilabel pipeline config stored in a dataset repo.

    Probes ``pipeline.json``, ``pipeline.yaml`` and ``pipeline.yml`` (in that
    order) at the root of ``datasets/<repo_id>`` through the module-level
    ``fs`` handle and loads the first one that exists with
    ``get_config_from_url``.

    Args:
        repo_id: Hub dataset identifier, e.g. ``"namespace/name"``.

    Returns:
        Whatever ``get_config_from_url`` returns for the first candidate
        file that exists, or ``None`` when the repo has none of them.
    """
    for file_type in (".json", ".yaml", ".yml"):
        file_name = f"pipeline{file_type}"
        # NOTE(review): `fs` is defined outside this block -- presumably an
        # HfFileSystem instance, whose paths use the "datasets/<repo_id>/..."
        # form. Confirm against the module header.
        if not fs.exists(f"datasets/{repo_id}/{file_name}"):
            continue
        # Only build the URL once the file is known to exist. The download
        # endpoint needs the "resolve/<revision>/" segment; the bare
        # "datasets/<repo_id>/<file>" path is not a file URL on the Hub.
        url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/{file_name}"
        return get_config_from_url(url)
    # No pipeline file of any supported type was found.
    return None
 
 
 
 
 
 
 
 
50
 
51
+ def _has_pipline(x):
52
+ if isinstance(x, str):
53
+ if "distilabel pipeline run" in x:
54
+ return "yes"
55
+ return "no"
56
 
57
  def _search_distilabel_repos(query: str = None,):
58
  filter = "library:distilabel"
 
62
  data = [ex.__dict__ for ex in datasets]
63
  df = pd.DataFrame.from_records(data)
64
  df["size_categories"] = df.tags.apply(_get_tag_category, args=["size_categories"])
65
+ df["has_pipeline"] = df.description.apply(_has_pipline)
 
66
  subset_columns = ['id', 'likes', 'downloads', "size_categories", 'has_pipeline', 'last_modified', 'description']
67
  new_column_order = subset_columns + [col for col in df.columns if col not in subset_columns]
68
  df = df[new_column_order]
 
81
 
82
  return "<br>".join([
83
  _get_main_title(repo_id=row["id"]),
 
84
  _embed_dataset_viewer(repo_id=row["id"]),
85
  _get_dataset_card(repo_id=row["id"]),
86
+ ]), "<br>".join([
87
+ _get_main_title(repo_id=row["id"]),
88
+ f'pipeline available: {_check_pipeline(repo_id=row["id"])}'
89
  ])
90
 
91
  # Define the Gradio interface
92
  with gr.Blocks(delete_cache=[1,1]) as demo:
93
  gr.Markdown("# ⚗️ Distilabel Synthetic Data Pipeline Finder")
94
+ gr.HTML("Select a repo_id to show the pipeline, dataset viewer and model card.")
95
  df: pd.DataFrame = _search_distilabel_repos()
 
96
  leader_board = Leaderboard(
97
  value=df,
98
  datatype=_categorize_dtypes(df),
 
103
  ColumnFilter("likes", type="slider", min=0, max=df.likes.max(), default=[0, df.likes.max()]),
104
  ColumnFilter("downloads", type="slider", min=0, max=df.downloads.max(), default=[0, df.downloads.max()]),
105
  ColumnFilter("size_categories", type="checkboxgroup"),
106
+ ColumnFilter("has_pipeline", type="dropdown"),
107
  ],
108
  hide_columns=[
109
  "_id", "private", "gated", "disabled", "sha", "downloads_all_time", "paperswithcode_id", "tags", "siblings",
110
  "cardData", "lastModified", "card_data", "key"],
111
+ select_columns=SelectColumns(default_selection=["id", "last_modified", "downloads", "likes", "has_pipeline", "size_categories"],
112
  cant_deselect=["id"],
113
  label="Select The Columns",
114
  info="Helpful information"),
115
  )
116
 
117
  with Modal() as modal:
118
+ with gr.Tab(label="dataset"):
119
+ markdown_1 = gr.HTML()
120
+ with gr.Tab(label="pipeline"):
121
+ markdown_2 = gr.HTML()
122
 
123
def update(leader_board: pd.DataFrame, markdown_1, markdown_2, evt: gr.SelectData):
    """Handle a cell selection on the leaderboard and fill the modal tabs.

    Args:
        leader_board: Current leaderboard data frame; returned unchanged.
        markdown_1: Current HTML of the "dataset" tab.
        markdown_2: Current HTML of the "pipeline" tab.
        evt: Selection event; ``evt.value`` is compared against the ``id``
            column to locate the selected repository.

    Returns:
        A 4-tuple ``(leader_board, markdown_1, markdown_2, modal)`` matching
        the outputs wired in ``leader_board.select``. Every path returns all
        four values so the event never yields fewer outputs than declared.
    """
    # A plain integer index is not treated as a cell selection: keep the
    # current content and make sure the modal is hidden.
    if isinstance(evt.index, int):
        return leader_board, markdown_1, markdown_2, Modal(visible=False)

    matches = leader_board[leader_board["id"] == evt.value]
    if matches.empty:
        # The clicked cell did not hold a repo id: hint the user and leave
        # the modal closed instead of falling through without a return value.
        gr.Info("Press a cell with the repo id.")
        return leader_board, markdown_1, markdown_2, Modal(visible=False)

    row = matches.iloc[0].to_dict()
    markdown_1, markdown_2 = _create_modal_info(row=row)
    return leader_board, markdown_1, markdown_2, Modal(visible=True)
136
+
137
+ leader_board.select(
138
+ update,
139
+ [leader_board, markdown_1, markdown_2],
140
+ [leader_board, markdown_1, markdown_2, modal]
141
+ )
142
 
 
143
 
144
 
145
  if __name__ == "__main__":