Try to generate commands from InferenceClient call
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: Dataset automatic notebook creator
 emoji: 🏢
 colorFrom: gray
 colorTo: indigo
app.py CHANGED
@@ -4,41 +4,120 @@ import nbformat as nbf
 from huggingface_hub import HfApi
 from httpx import Client
 import logging
-
+from huggingface_hub import InferenceClient
+import json
+import re
 
 """
 TODOs:
+- Refactor
+- Make the notebook generation more dynamic; add loading components so the UI does not freeze
+- Fix errors:
+  - When generating output
+  - When parsing output
+  - When pushing notebook
 - Parametrize the commands (Move to another file)
-- Let user choose the framework and get it from /compatible-libraries
 - Use an LLM to suggest commands by column types
-- Add
+- Add target tasks to choose for the notebook:
+  - Exploratory data analysis
+  - Auto training
+  - RAG
+  - etc.
 - Enable 'generate notebook' button only if dataset is available and supports library
+  - First get compatible-libraries and let user choose the library
 """
 
 # Configuration
 BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
 HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
 client = Client(headers=HEADERS)
+inference_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
 
 logging.basicConfig(level=logging.INFO)
 
 
 def get_compatible_libraries(dataset: str):
-    try:
     resp = client.get(
         f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
     )
     resp.raise_for_status()
     return resp.json()
-    except Exception as err:
-        logging.error(f"Failed to fetch compatible libraries: {err}")
-        return None
 
+import pandas as pd
+
+def generate_eda_prompt(columns_info, df, first_code):
+    # columns_info = df.dtypes.to_dict()
+    sample_data = df.head(5).to_dict(orient='records')
+    # prompt = (
+    #     "You are an expert data analyst tasked with generating an exploratory data analysis (EDA) jupyter notebook. "
+    #     "The data is provided as a pandas DataFrame with the following structure:\n\n"
+    #     f"Columns and Data Types:\n{columns_info}\n\n"
+    #     f"Sample Data:\n{sample_data}\n\n"
+    #     "Please create a pandas EDA notebook that includes the following:\n"
+    #     "1. Summary statistics for numerical columns.\n"
+    #     "2. Distribution plots for numerical columns.\n"
+    #     "3. Bar plots or count plots for categorical columns.\n"
+    #     "4. Correlation matrix and heatmap for numerical columns.\n"
+    #     "5. Any other relevant visualizations or analyses you deem appropriate.\n\n"
+    #     "Ensure the notebook is well-organized, with explanations for each step."
+    #     f"You can use the following code to load the dataset:\n\n{first_code}\n"
+    #     """The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":
+    #     ```json
+    #     [
+    #         {
+    #             "cell_type": string // This refers either is a markdown or code cell type.
+    #             "source": list of string // This is the list of text or python code.
+    #         }
+    #     ]
+    #     ```
+    #     Do not include more information than necessary, as this will be used to generate the notebook.
+    #     """
+    # )
+    format_instructions = """
+The output should be a markdown code snippet formatted in the
+following schema, including the leading and trailing "```json" and "```":
+
+```json
+[
+    {
+        "cell_type": string  // Whether this is a markdown or a code cell.
+        "source": list of string  // The list of text or Python code lines.
+    }
+]
+```
+"""
+
+    prompt = """
+You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
+
+Columns and Data Types:
+{columns_info}
+
+Sample Data:
+{sample_data}
+
+Please create a pandas EDA notebook that includes the following:
+
+1. Summary statistics for numerical columns.
+2. Distribution plots for numerical columns.
+3. Bar plots or count plots for categorical columns.
+4. Correlation matrix and heatmap for numerical columns.
+5. Any additional relevant visualizations or analyses you deem appropriate.
+
+Ensure the notebook is well-organized, with explanations for each step.
+
+It is mandatory that you use the following code to load the dataset, DO NOT try to load the dataset in any other way:
+
+{first_code}
+
+{format_instructions}
+"""
+    return prompt.format(columns_info=columns_info, sample_data=sample_data, first_code=first_code, format_instructions=format_instructions)
 
 def create_notebook_file(cell_commands, notebook_name):
     nb = nbf.v4.new_notebook()
-    nb["cells"] = [nbf.v4.new_code_cell(command) for command in cell_commands]
+    nb["cells"] = [nbf.v4.new_code_cell(command['source']) if command['cell_type'] == 'code' else nbf.v4.new_markdown_cell(command['source']) for command in cell_commands]
+
 
     with open(notebook_name, "w") as f:
         nbf.write(nb, f)
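Review note: create_notebook_file now consumes the {"cell_type", "source"} schema that format_instructions asks the model to emit, rather than a list of plain code strings. A minimal sketch of that round trip with made-up cell data (nbformat also accepts a list of source lines, but joining them keeps the in-memory node a plain string):

import nbformat as nbf

# Made-up cells in the schema the prompt requests from the model.
cells = [
    {"cell_type": "markdown", "source": ["# EDA notebook"]},
    {"cell_type": "code", "source": ["import pandas as pd", "print(pd.__version__)"]},
]

nb = nbf.v4.new_notebook()
nb["cells"] = [
    nbf.v4.new_code_cell("\n".join(cell["source"]))
    if cell["cell_type"] == "code"
    else nbf.v4.new_markdown_cell("\n".join(cell["source"]))
    for cell in cells
]

# Serialize to a .ipynb file that Jupyter can open directly.
with open("example.ipynb", "w") as f:
    nbf.write(nb, f)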
@@ -64,35 +143,100 @@ def push_notebook(file_path, dataset_id, token):
     logging.error(f"Failed to push notebook: {err}")
     return gr.HTML(value="Failed to push notebook", visible=True)
 
+def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
+    resp = client.get(f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}")
+    resp.raise_for_status()
+    content = resp.json()
+    rows = content["rows"]
+    rows = [row['row'] for row in rows]
+    first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
+    features = content['features']
+    features_dict = {feature['name']: feature['type'] for feature in features}
+    return features_dict, first_rows_df
+
+
+def content_from_output(output):
+    pattern = r'```json(.*?)```'
+    logging.info("--------> Getting data from output")
+    match = re.search(pattern, output, re.DOTALL)
+    if not match:
+        pattern = r'```(.*?)```'
+        logging.info("--------> Getting data from output, second try")
+        match = re.search(pattern, output, re.DOTALL)
+        if not match:
+            raise Exception("Unable to generate jupyter notebook.")
+    extracted_text = match.group(1)
+    logging.info(extracted_text)
+    return extracted_text
+
+
+def get_notebook_cells(prompt):
+    messages = [{"role": "user", "content": prompt}]
+    output = inference_client.chat_completion(messages=messages, max_tokens=2500)
+    output = output.choices[0].message.content
+    logging.info(output)
+    pattern = r'```json(.*?)```'
+    logging.info("--------> Getting data from output")
+    match = re.search(pattern, output, re.DOTALL)
+    if not match:
+        raise Exception("Unable to generate jupyter notebook.")
+    extracted_text = match.group(1)
+    logging.info(extracted_text)
+    content = json.loads(extracted_text)
+    logging.info(content)
+    return content
 
 def generate_notebook(dataset_id):
-    libraries = get_compatible_libraries(dataset_id)
-
+
+    # TODO: Load dataframe from notebook here
+    # generate_eda_prompt
+
+    try:
+        libraries = get_compatible_libraries(dataset_id)
+    except Exception as err:
+        gr.Error('Unable to retrieve dataset info from HF Hub.')
+        logging.error(f"Failed to fetch compatible libraries: {err}")
+        return None
 
     if not libraries:
+        gr.Warning('Dataset not compatible with pandas library.')
+        logging.error("Dataset not compatible with pandas library")
         return gr.File(visible=False), gr.Row.update(visible=False)
 
     pandas_library = next(
         (lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"),
         None,
     )
-    if pandas_library:
+    if not pandas_library:
+        gr.Warning('Dataset not compatible with pandas library.')
+        logging.error("Dataset not compatible with pandas library")
         return gr.File(visible=False), gr.Row.update(visible=False)
 
+    first_config_loading_code = pandas_library['loading_codes'][0]
+    first_code = first_config_loading_code['code']
+
+    first_config = first_config_loading_code['config_name']
+    first_split = list(first_config_loading_code['arguments']['splits'].keys())[0]
+    logging.info(f"First config: {first_config} - first split: {first_split}")
+    first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
+    logging.info(f"First split file: {first_file}")
     html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
-    commands = [
-        "!pip install pandas",
-        "import pandas as pd"
-        f"df = pd.read_parquet('{first_file}')",
-        "df.head()",
-        f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
-        "print(df.shape)",
-        "df.columns",
-        "df.describe()",
-        "df.info()",
-        # TODO: Generate more commands according to column types for EDA and then for auto training?
-    ]
+    features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
+    prompt = generate_eda_prompt(features, df, first_code)
+    logging.info(f"Prompt: {prompt}")
+    commands = get_notebook_cells(prompt)
+    # TODO: Generate these commands using InferenceClient
+    # commands = [
+    #     "!pip install pandas",
+    #     "import pandas as pd"
+    #     f"df = pd.read_parquet('{first_file}')",
+    #     "df.head()",
+    #     f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
+    #     "print(df.shape)",
+    #     "df.columns",
+    #     "df.describe()",
+    #     "df.info()",
+    #     # TODO: Generate more commands according to column types for EDA and then for auto training?
+    # ]
     notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
     create_notebook_file(commands, notebook_name=notebook_name)
     return gr.File(value=notebook_name, visible=True), gr.Row.update(visible=True)
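Review note: the new config/split selection in generate_notebook assumes the /compatible-libraries payload shape below. A self-contained sketch with a made-up payload, tracing the same lookups the new code performs:

# Made-up /compatible-libraries payload, shaped the way generate_notebook
# indexes into it (libraries -> loading_codes -> arguments/splits).
libraries = {
    "libraries": [
        {
            "library": "pandas",
            "loading_codes": [
                {
                    "config_name": "default",
                    "arguments": {"splits": {"train": "data/train-00000-of-00001.parquet"}},
                    "code": "df = pd.read_parquet('hf://datasets/<dataset_id>/data/train-00000-of-00001.parquet')",
                }
            ],
        }
    ]
}

dataset_id = "user/dataset"  # placeholder repo id
pandas_library = next(
    (lib for lib in libraries["libraries"] if lib["library"] == "pandas"), None
)
first_config_loading_code = pandas_library["loading_codes"][0]
first_config = first_config_loading_code["config_name"]
first_split = next(iter(first_config_loading_code["arguments"]["splits"]))
first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
print(first_config, first_split, first_file)
# -> default train hf://datasets/user/dataset/data/train-00000-of-00001.parquet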
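Review note: push_notebook itself is untouched by this commit; only its error branch appears in the hunk above. Assuming it wraps HfApi.upload_file (an assumption, not shown in this diff), a sketch of pushing the generated notebook to the dataset repo could look like:

from huggingface_hub import HfApi

def push_notebook_sketch(file_path: str, dataset_id: str, token: str) -> str:
    # Upload the generated .ipynb into the dataset repository;
    # repo_type="dataset" targets the dataset rather than a model repo.
    api = HfApi(token=token)
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=file_path,
        repo_id=dataset_id,
        repo_type="dataset",
    )
    return f"https://huggingface.co/datasets/{dataset_id}/blob/main/{file_path}"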
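Review note: content_from_output and get_notebook_cells duplicate the fenced-JSON extraction. A compact version of that parsing step, exercised with a toy output string rather than a real model response:

import json
import re

def extract_cells(output: str):
    # First look for the ```json fence the prompt asks for, then fall
    # back to any fenced block, mirroring content_from_output.
    match = re.search(r"```json(.*?)```", output, re.DOTALL)
    if not match:
        match = re.search(r"```(.*?)```", output, re.DOTALL)
    if not match:
        raise ValueError("No fenced JSON block found in model output.")
    return json.loads(match.group(1))

# Toy model output, not a real response:
output = 'Here you go:\n```json\n[{"cell_type": "markdown", "source": ["# EDA"]}]\n```'
print(extract_cells(output))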
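Review note: the chat_completion call in get_notebook_cells is the InferenceClient usage the commit title refers to. A minimal standalone sketch (assumes a configured Hugging Face token with access to the gated Llama 3 weights; the prompt here is a toy):

from huggingface_hub import InferenceClient

# Same model id the Space now wires up for cell generation.
inference_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")

messages = [{"role": "user", "content": "Return one markdown cell in a ```json fenced list."}]
output = inference_client.chat_completion(messages=messages, max_tokens=2500)
print(output.choices[0].message.content)  # raw text; parse with extract_cells above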