asoria HF staff committed on
Commit
45f97ba
·
1 Parent(s): f5da21f
Files changed (1) hide show
  1. app.py +80 -64
app.py CHANGED
@@ -44,20 +44,24 @@ logging.basicConfig(level=logging.INFO)
44
 
45
 
46
  def get_compatible_libraries(dataset: str):
47
- resp = client.get(
48
- f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
49
- )
50
- resp.raise_for_status()
51
- return resp.json()
 
 
 
 
52
 
53
 
54
  def create_notebook_file(cell_commands, notebook_name):
55
  nb = nbf.v4.new_notebook()
56
  nb["cells"] = [
57
- nbf.v4.new_code_cell(command["source"])
58
- if command["cell_type"] == "code"
59
- else nbf.v4.new_markdown_cell(command["source"])
60
- for command in cell_commands
61
  ]
62
 
63
  with open(notebook_name, "w") as f:
@@ -65,45 +69,51 @@ def create_notebook_file(cell_commands, notebook_name):
65
  logging.info(f"Notebook {notebook_name} created successfully")
66
 
67
 
68
- def push_notebook(file_path, dataset_id, token):
69
- notebook_name = "dataset_analysis.ipynb"
70
- api = HfApi(token=token)
71
  try:
72
- api.upload_file(
73
- path_or_fileobj=file_path,
74
- path_in_repo=notebook_name,
75
- repo_id=dataset_id,
76
- repo_type="dataset",
77
- )
78
- link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_name}"
79
- return gr.HTML(
80
- value=f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">See notebook</a>',
81
- visible=True,
82
  )
83
- except Exception as err:
84
- logging.error(f"Failed to push notebook: {err}")
85
- return gr.HTML(value="Failed to push notebook", visible=True)
 
 
 
 
 
 
 
 
86
 
87
 
88
- def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
89
- resp = client.get(
90
- f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}"
91
- )
92
- resp.raise_for_status()
93
- content = resp.json()
94
- rows = content["rows"]
95
- rows = [row["row"] for row in rows]
96
- first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
97
- features = content["features"]
98
- features_dict = {feature["name"]: feature["type"] for feature in features}
99
- return features_dict, first_rows_df
100
 
101
 
102
- def get_txt_from_output(output):
103
- extracted_text = content_from_output(output)
104
- content = json.loads(extracted_text)
105
- logging.info(content)
106
- return content
 
 
 
 
 
 
 
 
 
 
107
 
108
 
109
  def content_from_output(output):
@@ -123,18 +133,26 @@ def content_from_output(output):
123
  return match.group(1)
124
 
125
 
126
- def generate_eda_cells(dataset_id):
127
- for messages in generate_cells(dataset_id, generate_eda_prompt):
128
  yield messages, gr.update(visible=False), None # Keep button hidden
129
 
130
- yield messages, gr.update(visible=True), f"{dataset_id.replace('/', '-')}.ipynb"
 
 
 
 
131
 
132
 
133
- def generate_embedding_cells(dataset_id):
134
- for messages in generate_cells(dataset_id, generate_embedding_prompt):
135
  yield messages, gr.update(visible=False), None # Keep button hidden
136
 
137
- yield messages, gr.update(visible=True), f"{dataset_id.replace('/', '-')}.ipynb"
 
 
 
 
138
 
139
 
140
  def push_to_hub(
@@ -149,6 +167,7 @@ def push_to_hub(
149
  yield history + [
150
  gr.ChatMessage(role="assistant", content="⏳ _Login to push to hub..._")
151
  ]
 
152
  logging.info(f"Profile: {profile}, token: {oauth_token.token}")
153
 
154
  notebook_name = "dataset_analysis.ipynb"
@@ -165,15 +184,16 @@ def push_to_hub(
165
  logging.info(f"Notebook pushed to hub: {link}")
166
  yield history + [
167
  gr.ChatMessage(
168
- role="assistant", content=f"[Here is the generated notebook]({link})"
 
169
  )
170
  ]
171
- except Exception as err:
172
- logging.info("Failed to push notebook", err)
173
- yield history + [gr.ChatMessage(role="assistant", content=err)]
174
 
175
 
176
- def generate_cells(dataset_id, prompt_fn):
177
  try:
178
  libraries = get_compatible_libraries(dataset_id)
179
  except Exception as err:
@@ -198,12 +218,8 @@ def generate_cells(dataset_id, prompt_fn):
198
  first_code = first_config_loading_code["code"]
199
  first_config = first_config_loading_code["config_name"]
200
  first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
201
- logging.info(f"First config: {first_config} - first split: {first_split}")
202
- first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
203
- logging.info(f"First split file: {first_file}")
204
  features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
205
- sample_data = df.head(5).to_dict(orient="records")
206
- prompt = prompt_fn(features, sample_data, first_code)
207
  messages = [gr.ChatMessage(role="user", content=prompt)]
208
  yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")]
209
 
@@ -240,7 +256,7 @@ def generate_cells(dataset_id, prompt_fn):
240
 
241
  commands = get_txt_from_output(cells_txt)
242
  html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
243
- # Adding dataset viewer on the first part
244
  commands.insert(
245
  0,
246
  {
@@ -249,10 +265,10 @@ def generate_cells(dataset_id, prompt_fn):
249
  },
250
  )
251
  commands.insert(0, {"cell_type": "markdown", "source": "# Dataset Viewer"})
252
- notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
253
  create_notebook_file(commands, notebook_name=notebook_name)
254
  messages.append(
255
- gr.ChatMessage(role="user", content="Here is the generated notebook")
256
  )
257
  yield messages
258
  messages.append(
@@ -264,8 +280,8 @@ def generate_cells(dataset_id, prompt_fn):
264
  yield messages
265
 
266
 
267
- def comming_soon_message():
268
- gr.Info("Comming soon")
269
 
270
 
271
  with gr.Blocks(fill_height=True) as demo:
@@ -322,7 +338,7 @@ with gr.Blocks(fill_height=True) as demo:
322
  outputs=[chatbot, push_btn, notebook_file],
323
  )
324
 
325
- generate_training_btn.click(comming_soon_message, inputs=[], outputs=[])
326
  push_btn.click(
327
  push_to_hub,
328
  inputs=[
 
44
 
45
 
46
  def get_compatible_libraries(dataset: str):
47
+ try:
48
+ response = client.get(
49
+ f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
50
+ )
51
+ response.raise_for_status()
52
+ return response.json()
53
+ except Exception as e:
54
+ logging.error(f"Error fetching compatible libraries: {e}")
55
+ raise
56
 
57
 
58
  def create_notebook_file(cell_commands, notebook_name):
59
  nb = nbf.v4.new_notebook()
60
  nb["cells"] = [
61
+ nbf.v4.new_code_cell(cmd["source"])
62
+ if cmd["cell_type"] == "code"
63
+ else nbf.v4.new_markdown_cell(cmd["source"])
64
+ for cmd in cell_commands
65
  ]
66
 
67
  with open(notebook_name, "w") as f:
 
69
  logging.info(f"Notebook {notebook_name} created successfully")
70
 
71
 
72
+ def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
 
 
73
  try:
74
+ resp = client.get(
75
+ f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}"
 
 
 
 
 
 
 
 
76
  )
77
+ resp.raise_for_status()
78
+ content = resp.json()
79
+ rows = content["rows"]
80
+ rows = [row["row"] for row in rows]
81
+ first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
82
+ features = content["features"]
83
+ features_dict = {feature["name"]: feature["type"] for feature in features}
84
+ return features_dict, first_rows_df
85
+ except Exception as e:
86
+ logging.error(f"Error fetching first rows: {e}")
87
+ raise
88
 
89
 
90
def get_txt_from_output(output):
    """Extract the JSON payload from the model output and decode it.

    Args:
        output: Raw text produced by the model; it is handed to
            ``extract_content_from_output`` to isolate the JSON text.

    Returns:
        The object produced by ``json.loads`` on the extracted text.

    Raises:
        Exception: Any error raised while extracting or decoding is logged
            and re-raised unchanged.
    """
    try:
        extracted_text = extract_content_from_output(output)
        content = json.loads(extracted_text)
        logging.info(content)
        return content
    except Exception as e:
        # gr.Error only reaches the UI when it is raised; merely constructing
        # it is a no-op. gr.Warning shows its message when called, which lets
        # us notify the user and still re-raise the original exception.
        gr.Warning("Error when parsing notebook, try again.")
        logging.error(f"Failed to parse notebook content from output: {e}")
        raise
 
 
100
 
101
 
102
def extract_content_from_output(output):
    """Return the text found inside the first code fence of ``output``.

    Patterns are tried in order: a complete ```json ... ``` fence, a
    single-backtick `json ... ` span, then any ``` ... ``` fence. If none
    match, everything after an unterminated ```json marker is returned.

    Args:
        output: Text to search.

    Returns:
        The captured text, without the surrounding fence markers.

    Raises:
        ValueError: If no pattern matches and ``output`` contains no
            ```json marker.
    """
    # The full ```json fence is tried first so that single backticks inside
    # the payload (e.g. inline code in a markdown cell) do not cut it short.
    patterns = [r"```json(.*?)```", r"`json(.*?)`", r"```(.*?)```"]

    for pattern in patterns:
        match = re.search(pattern, output, re.DOTALL)
        if match:
            return match.group(1)

    # Fallback: an opening fence that was never closed.
    fence = "```json"
    try:
        index = output.index(fence)
        logging.info(f"Index: {index}")
        return output[index + len(fence) :]
    except ValueError:
        logging.error("Unable to generate Jupyter notebook.")
        raise
117
 
118
 
119
  def content_from_output(output):
 
133
  return match.group(1)
134
 
135
 
136
def generate_eda_cells(dataset_id, profile: gr.OAuthProfile | None):
    """Stream the messages from ``generate_cells`` for the EDA notebook.

    Args:
        dataset_id: Dataset identifier; the text before the first "/" is
            compared against the logged-in user's name.
        profile: OAuth profile of the current user, or ``None`` when nobody
            is logged in.

    Yields:
        ``(messages, update, notebook_path)`` tuples. While generation is in
        progress the update hides its target (presumably the push button)
        and the path is ``None``. The final tuple carries the notebook file
        name and makes the target visible only when the dataset namespace
        equals ``profile.username``.
    """
    for messages in generate_cells(dataset_id, generate_eda_prompt, "eda"):
        yield messages, gr.update(visible=False), None  # Keep button hidden

    # Build a real boolean: `profile and ...` evaluates to None when the user
    # is not logged in, which is not an explicit "hidden" value.
    can_push = profile is not None and dataset_id.split("/")[0] == profile.username
    yield (
        messages,
        gr.update(visible=can_push),
        f"{dataset_id.replace('/', '-')}-eda.ipynb",
    )
145
 
146
 
147
def generate_embedding_cells(dataset_id, profile: gr.OAuthProfile | None):
    """Stream the messages from ``generate_cells`` for the embedding notebook.

    Args:
        dataset_id: Dataset identifier; the text before the first "/" is
            compared against the logged-in user's name.
        profile: OAuth profile of the current user, or ``None`` when nobody
            is logged in.

    Yields:
        ``(messages, update, notebook_path)`` tuples. While generation is in
        progress the update hides its target (presumably the push button)
        and the path is ``None``. The final tuple carries the notebook file
        name and makes the target visible only when the dataset namespace
        equals ``profile.username``.
    """
    for messages in generate_cells(dataset_id, generate_embedding_prompt, "embedding"):
        yield messages, gr.update(visible=False), None  # Keep button hidden

    # Build a real boolean: `profile and ...` evaluates to None when the user
    # is not logged in, which is not an explicit "hidden" value.
    can_push = profile is not None and dataset_id.split("/")[0] == profile.username
    yield (
        messages,
        gr.update(visible=can_push),
        f"{dataset_id.replace('/', '-')}-embedding.ipynb",
    )
156
 
157
 
158
  def push_to_hub(
 
167
  yield history + [
168
  gr.ChatMessage(role="assistant", content="⏳ _Login to push to hub..._")
169
  ]
170
+ return
171
  logging.info(f"Profile: {profile}, token: {oauth_token.token}")
172
 
173
  notebook_name = "dataset_analysis.ipynb"
 
184
  logging.info(f"Notebook pushed to hub: {link}")
185
  yield history + [
186
  gr.ChatMessage(
187
+ role="user",
188
+ content=f"[See the notebook on the Hub]({link})",
189
  )
190
  ]
191
+ except Exception as e:
192
+ logging.info("Failed to push notebook", e)
193
+ yield history + [gr.ChatMessage(role="assistant", content=e)]
194
 
195
 
196
+ def generate_cells(dataset_id, prompt_fn, notebook_type="eda"):
197
  try:
198
  libraries = get_compatible_libraries(dataset_id)
199
  except Exception as err:
 
218
  first_code = first_config_loading_code["code"]
219
  first_config = first_config_loading_code["config_name"]
220
  first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
 
 
 
221
  features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
222
+ prompt = prompt_fn(features, df.head(5).to_dict(orient="records"), first_code)
 
223
  messages = [gr.ChatMessage(role="user", content=prompt)]
224
  yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")]
225
 
 
256
 
257
  commands = get_txt_from_output(cells_txt)
258
  html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
259
+
260
  commands.insert(
261
  0,
262
  {
 
265
  },
266
  )
267
  commands.insert(0, {"cell_type": "markdown", "source": "# Dataset Viewer"})
268
+ notebook_name = f"{dataset_id.replace('/', '-')}-{notebook_type}.ipynb"
269
  create_notebook_file(commands, notebook_name=notebook_name)
270
  messages.append(
271
+ gr.ChatMessage(role="user", content="Here is the generated notebook file")
272
  )
273
  yield messages
274
  messages.append(
 
280
  yield messages
281
 
282
 
283
+ def coming_soon_message():
284
+ return gr.Info("Coming soon")
285
 
286
 
287
  with gr.Blocks(fill_height=True) as demo:
 
338
  outputs=[chatbot, push_btn, notebook_file],
339
  )
340
 
341
+ generate_training_btn.click(coming_soon_message, inputs=[], outputs=[])
342
  push_btn.click(
343
  push_to_hub,
344
  inputs=[