asoria (HF staff) committed
Commit 810f00f · 1 Parent(s): f327376

Try to generate commands from InferenceClient call

Files changed (2):
  1. README.md +1 -1
  2. app.py +169 -25
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Auto Dataset Analyst Creator
+title: Dataset automatic notebook creator
 emoji: 🏢
 colorFrom: gray
 colorTo: indigo
app.py CHANGED
@@ -4,41 +4,120 @@ import nbformat as nbf
 from huggingface_hub import HfApi
 from httpx import Client
 import logging
-
+from huggingface_hub import InferenceClient
+import json
+import re
 
 """
 TODOs:
-- Add more commands to the notebook
+- Refactor
+- Make the notebook generation more dynamic; add loading components so the UI does not freeze
+- Fix errors:
+  - When generating output
+  - When parsing output
+  - When pushing notebook
 - Parametrize the commands (Move to another file)
-- Let user choose the framework and get it from /compatible-libraries
 - Use an LLM to suggest commands by column types
-- Add commands for auto training
+- Add target tasks to choose for the notebook:
+  - Exploratory data analysis
+  - Auto training
+  - RAG
+  - etc.
 - Enable 'generate notebook' button only if dataset is available and supports library
+- First get compatible-libraries and let user choose the library
 """
 
 # Configuration
 BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
 HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
 client = Client(headers=HEADERS)
+inference_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
 
 logging.basicConfig(level=logging.INFO)
 
 
 def get_compatible_libraries(dataset: str):
-    try:
     resp = client.get(
         f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
     )
     resp.raise_for_status()
     return resp.json()
-    except Exception as err:
-        logging.error(f"Failed to fetch compatible libraries: {err}")
-        return None
 
+import pandas as pd
+
+def generate_eda_prompt(columns_info, df, first_code):
+    # columns_info = df.dtypes.to_dict()
+    sample_data = df.head(5).to_dict(orient='records')
+    # prompt = (
+    #     "You are an expert data analyst tasked with generating an exploratory data analysis (EDA) jupyter notebook. "
+    #     "The data is provided as a pandas DataFrame with the following structure:\n\n"
+    #     f"Columns and Data Types:\n{columns_info}\n\n"
+    #     f"Sample Data:\n{sample_data}\n\n"
+    #     "Please create a pandas EDA notebook that includes the following:\n"
+    #     "1. Summary statistics for numerical columns.\n"
+    #     "2. Distribution plots for numerical columns.\n"
+    #     "3. Bar plots or count plots for categorical columns.\n"
+    #     "4. Correlation matrix and heatmap for numerical columns.\n"
+    #     "5. Any other relevant visualizations or analyses you deem appropriate.\n\n"
+    #     "Ensure the notebook is well-organized, with explanations for each step."
+    #     f"You can use the following code to load the dataset:\n\n{first_code}\n"
+    #     """The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":
+    #     ```json
+    #     [
+    #         {
+    #             "cell_type": string // Whether this is a markdown or a code cell.
+    #             "source": list of string // The list of text or Python code lines.
+    #         }
+    #     ]
+    #     ```
+    #     Do not include more information than necessary, as this will be used to generate the notebook.
+    #     """
+    # )
+    format_instructions = """
+    The output should be a markdown code snippet formatted in the
+    following schema, including the leading and trailing "```json" and "```":
+
+    ```json
+    [
+        {
+            "cell_type": string // Whether this is a markdown or a code cell.
+            "source": list of string // The list of text or Python code lines.
+        }
+    ]
+    ```
+    """
+
+    prompt = """
+    You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
+
+    Columns and Data Types:
+    {columns_info}
+
+    Sample Data:
+    {sample_data}
+
+    Please create a pandas EDA notebook that includes the following:
+
+    1. Summary statistics for numerical columns.
+    2. Distribution plots for numerical columns.
+    3. Bar plots or count plots for categorical columns.
+    4. Correlation matrix and heatmap for numerical columns.
+    5. Any additional relevant visualizations or analyses you deem appropriate.
+
+    Ensure the notebook is well-organized, with explanations for each step.
+
+    It is mandatory that you use the following code to load the dataset; DO NOT try to load the dataset in any other way:
+
+    {first_code}
+
+    {format_instructions}
+    """
+    return prompt.format(columns_info=columns_info, sample_data=sample_data, first_code=first_code, format_instructions=format_instructions)
 
 def create_notebook_file(cell_commands, notebook_name):
     nb = nbf.v4.new_notebook()
-    nb["cells"] = [nbf.v4.new_code_cell(command) for command in cell_commands]
+    nb["cells"] = [nbf.v4.new_code_cell(command['source']) if command['cell_type'] == 'code' else nbf.v4.new_markdown_cell(command['source']) for command in cell_commands]
 
     with open(notebook_name, "w") as f:
         nbf.write(nb, f)
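The format_instructions string above pins the model to a JSON list of cells, which the updated create_notebook_file then turns into notebook cells. A minimal sketch of that round trip, with an invented sample payload standing in for real model output (the cell contents and file name below are illustrative, not from the commit):

import nbformat as nbf

# Hand-written stand-in for a model response that follows the schema:
# a list of {"cell_type": ..., "source": [...]} objects.
sample_cells = [
    {"cell_type": "markdown", "source": ["# Exploratory Data Analysis"]},
    {"cell_type": "code", "source": ["import pandas as pd", "df.describe()"]},
]

nb = nbf.v4.new_notebook()
# Same branching as the new create_notebook_file: code cells vs. markdown
# cells. Joining the source lines hands nbformat a plain string.
nb["cells"] = [
    nbf.v4.new_code_cell("\n".join(cell["source"]))
    if cell["cell_type"] == "code"
    else nbf.v4.new_markdown_cell("\n".join(cell["source"]))
    for cell in sample_cells
]

with open("sample.ipynb", "w") as f:
    nbf.write(nb, f)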
 
@@ -64,35 +143,100 @@ def push_notebook(file_path, dataset_id, token):
         logging.error(f"Failed to push notebook: {err}")
         return gr.HTML(value="Failed to push notebook", visible=True)
 
+def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
+    resp = client.get(f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}")
+    resp.raise_for_status()
+    content = resp.json()
+    rows = content["rows"]
+    rows = [row['row'] for row in rows]
+    first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
+    features = content['features']
+    features_dict = {feature['name']: feature['type'] for feature in features}
+    return features_dict, first_rows_df
+
+
+def content_from_output(output):
+    pattern = r'`json(.*?)`'
+    logging.info("--------> Getting data from output")
+    match = re.search(pattern, output, re.DOTALL)
+    if not match:
+        pattern = r'```(.*?)```'
+        logging.info("--------> Getting data from output, second try")
+        match = re.search(pattern, output, re.DOTALL)
+        if not match:
+            raise Exception("Unable to generate jupyter notebook.")
+    extracted_text = match.group(1)
+    logging.info(extracted_text)
+
+
+def get_notebook_cells(prompt):
+    messages = [{"role": "user", "content": prompt}]
+    output = inference_client.chat_completion(messages=messages, max_tokens=2500)
+    output = output.choices[0].message.content
+    logging.info(output)
+    pattern = r'`json(.*?)`'
+    logging.info("--------> Getting data from output")
+    match = re.search(pattern, output, re.DOTALL)
+    if not match:
+        raise Exception("Unable to generate jupyter notebook.")
+    extracted_text = match.group(1)
+    logging.info(extracted_text)
+    content = json.loads(extracted_text)
+    logging.info(content)
+    return content
 
 def generate_notebook(dataset_id):
-    first_code = f"import pandas as pd\n\ndf = pd.read_parquet('hf://datasets/{dataset_id}/data/train-00000-of-00001.parquet')"
-    libraries = get_compatible_libraries(dataset_id)
+
+    # TODO: Load dataframe from notebook here
+    # generate_eda_prompt
+
+    try:
+        libraries = get_compatible_libraries(dataset_id)
+    except Exception as err:
+        gr.Error('Unable to retrieve dataset info from HF Hub.')
+        logging.error(f"Failed to fetch compatible libraries: {err}")
+        return None
 
     if not libraries:
+        gr.Warning('Dataset not compatible with pandas library.')
+        logging.error("Dataset not compatible with pandas library")
         return gr.File(visible=False), gr.Row.update(visible=False)
 
     pandas_library = next(
         (lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"),
         None,
     )
-    if pandas_library:
-        first_code = pandas_library["loading_codes"][0]["code"]
-    else:
+    if not pandas_library:
+        gr.Warning('Dataset not compatible with pandas library.')
+        logging.error("Dataset not compatible with pandas library")
         return gr.File(visible=False), gr.Row.update(visible=False)
 
+    first_config_loading_code = pandas_library['loading_codes'][0]
+    first_code = first_config_loading_code['code']
+
+    first_config = first_config_loading_code['config_name']
+    first_split = list(first_config_loading_code['arguments']['splits'].keys())[0]
+    logging.info(f"First config: {first_config} - first split: {first_split}")
+    first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
+    logging.info(f"First split file: {first_file}")
     html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
-    commands = [
-        "!pip install pandas",
-        first_code,
-        "df.head()",
-        f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
-        "print(df.shape)",
-        "df.columns",
-        "df.describe()",
-        "df.info()",
-        # TODO: Generate more commands according to column types for EDA and then for auto training?
-    ]
+    features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
+    prompt = generate_eda_prompt(features, df, first_code)
+    logging.info(f"Prompt: {prompt}")
+    commands = get_notebook_cells(prompt)
+    # TODO: Generate these commands using InferenceClient
+    # commands = [
+    #     "!pip install pandas",
+    #     "import pandas as pd",
+    #     f"df = pd.read_parquet('{first_file}')",
+    #     "df.head()",
+    #     f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
+    #     "print(df.shape)",
+    #     "df.columns",
+    #     "df.describe()",
+    #     "df.info()",
+    #     # TODO: Generate more commands according to column types for EDA and then for auto training?
+    # ]
     notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
     create_notebook_file(commands, notebook_name=notebook_name)
     return gr.File(value=notebook_name, visible=True), gr.Row.update(visible=True)
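For context on the two datasets-server calls generate_notebook now chains, here is a minimal standalone sketch. The dataset id is a placeholder; the response shapes mirror exactly the keys the committed code reads (libraries / loading_codes / arguments.splits, and features / rows.row):

from httpx import Client
import pandas as pd

client = Client(headers={"Accept": "application/json"})
BASE = "https://datasets-server.huggingface.co"
dataset = "some-user/some-dataset"  # placeholder id, not from the commit

# /compatible-libraries lists per-library loading snippets plus the
# config/split layout that generate_notebook pulls the parquet path from.
libs = client.get(f"{BASE}/compatible-libraries?dataset={dataset}").json()
pandas_lib = next(lib for lib in libs["libraries"] if lib["library"] == "pandas")
loading = pandas_lib["loading_codes"][0]
config = loading["config_name"]
split = list(loading["arguments"]["splits"])[0]

# /first-rows returns the features and a small sample of rows, which
# get_first_rows_as_df turns into the DataFrame embedded in the prompt.
content = client.get(
    f"{BASE}/first-rows?dataset={dataset}&config={config}&split={split}"
).json()
df = pd.DataFrame([r["row"] for r in content["rows"]]).head(3)
features = {f["name"]: f["type"] for f in content["features"]}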
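And a self-contained sketch of the extraction step in get_notebook_cells, with a hand-written reply standing in for the inference_client.chat_completion(...).choices[0].message.content string:

import json
import re

# Invented model reply following the ```json fence requested by the prompt.
output = 'Here you go:\n```json\n[{"cell_type": "code", "source": ["df.head()"]}]\n```'

# The single backtick before "json" in the pattern matches the last backtick
# of the opening ```json fence; the lazy group then stops at the first
# backtick of the closing fence, capturing just the JSON payload.
match = re.search(r'`json(.*?)`', output, re.DOTALL)
if not match:
    raise Exception("Unable to generate jupyter notebook.")
cells = json.loads(match.group(1))
print(cells)  # [{'cell_type': 'code', 'source': ['df.head()']}]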