andymbryant committed on
Commit
0e660e8
·
1 Parent(s): 527cf73

added test notebook for binder

Browse files
Files changed (1) hide show
  1. test_notebook.ipynb +509 -0
test_notebook.ipynb ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import pandas as pd\n",
11
+ "import gradio as gr\n",
12
+ "from pydantic import BaseModel, Field\n",
13
+ "\n",
14
+ "import langchain\n",
15
+ "from langchain.output_parsers import PydanticOutputParser\n",
16
+ "from langchain.prompts import ChatPromptTemplate\n",
17
+ "from langchain.prompts import ChatPromptTemplate\n",
18
+ "from langchain.tools import PythonAstREPLTool\n",
19
+ "from langchain.chat_models import ChatOpenAI\n",
20
+ "from langchain.schema.output_parser import StrOutputParser"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 8,
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "langchain.debug = False\n",
30
+ "# Throwaway key with a strict usage limit. NOTE(review): avoid committing API keys; prefer reading OPENAI_API_KEY from the environment.\n",
31
+ "os.environ[\"OPENAI_API_KEY\"] = \"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\"\n",
32
+ "pd.set_option('display.max_columns', 20)\n",
33
+ "pd.set_option('display.max_rows', 20)"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 9,
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "data_dir_path = os.path.join(os.getcwd(), 'data')\n",
43
+ "NUM_ROWS_TO_RETURN = 5\n",
44
+ "\n",
45
+ "table_1_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_a.csv'))\n",
46
+ "table_2_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_b.csv'))\n",
47
+ "template_df = pd.read_csv(os.path.join(data_dir_path, 'legal_template.csv'))"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 10,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "transform_model = ChatOpenAI(\n",
57
+ " model_name='gpt-4',\n",
58
+ " temperature=0,\n",
59
+ ")\n",
60
+ "\n",
61
+ "natural_language_model = ChatOpenAI(\n",
62
+ " model_name='gpt-4',\n",
63
+ " temperature=0.1,\n",
64
+ ")"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": 11,
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "# TODO: add validation to models, coupled with retry mechanism in chain\n",
74
+ "class TableMappingEntry(BaseModel):\n",
75
+ " '''A single row in a table mapping. Describes how a single column in a source table maps to a single column in a target table, including any necessary transformations, and their explanations.'''\n",
76
+ " source_column_name: str = Field(..., description=\"Name of the column in the source table.\")\n",
77
+ " target_column_name: str = Field(..., description=\"Name of the column in the target table, to which the source column maps.\")\n",
78
+ " value_transformations: str = Field(..., description=\"Transformations needed to make the source values match the target values. If unnecessary, write 'NO_TRANSFORM'.\")\n",
79
+ " explanation: str = Field(..., description=\"One-sentence explanation of this row (source-target mapping/transformation). Include any information that might be relevant to a software engineer building an ETL pipeline with this document.\")\n",
80
+ "\n",
81
+ "class TableMapping(BaseModel):\n",
82
+ " '''A list of table mappings that collectively describe how a source table should be transformed to match the schema of a target table.'''\n",
83
+ " table_mappings: list[TableMappingEntry] = Field(..., description=\"A list of table mappings.\")\n",
84
+ " \n",
85
+ "analyst_prompt_str = '''\n",
86
+ "You are a Data Scientist, who specializes in generating schema mappings for use by Software Engineers in ETL pipelines.\n",
87
+ "\n",
88
+ "Head of `source_csv`:\n",
89
+ "\n",
90
+ "{source_1_csv_str}\n",
91
+ "\n",
92
+ "Head of `target_csv`:\n",
93
+ "\n",
94
+ "{target_csv_str}\n",
95
+ "\n",
96
+ "Your job is to generate a thorough, precise summary of how `source_csv` should be transformed to adhere exactly to the `target_csv` schema.\n",
97
+ "\n",
98
+ "For each column in the `source_csv`, you must communicate which column in the `target_csv` it maps to, and how the values in the `source_csv` column should be transformed to match those in the `target_csv`.\n",
99
+ "You can assume the rows are aligned: that is, the first row in `source_csv` corresponds to the first row in `target_csv`, and so on.\n",
100
+ "\n",
101
+ "Remember:\n",
102
+ "1. Which column in `target_csv` it maps to. You should consider the semantic meaning of the columns, not just the character similarity. \n",
103
+ "\n",
104
+ "Example mappings:\n",
105
+ "- 'MunICipality' in `source_csv` should map to 'City' in `target_csv`.\n",
106
+ "- 'fullname' in `source_csv` should map to both 'FirstName' and 'LastName' in `target_csv`. You must explain this transformation, as well, including the target sequencing of first and last name.\n",
107
+ "\n",
108
+ "Example transformations:\n",
109
+ "- If date in `source_csv` is `2020-01-01` and date in `target_csv` is `01/01/2020`, explain exactly how this should be transformed and the reasoning behind it.\n",
110
+ "- If city in `source_csv` is `New York` and city in `target_csv` is `NEW YORK` or `NYC`, explain exactly how this should be transformed and the reasoning behind it.\n",
111
+ "\n",
112
+ "Lastly, point out any other oddities, such as duplicate columns, erroneous columns, etc.\n",
113
+ "\n",
114
+ "{format_instructions}\n",
115
+ "\n",
116
+ "Remember:\n",
117
+ "- Be concise: you are speaking to engineers, not customers.\n",
118
+ "- Be precise: all of these values are case sensitive. Consider casing for city names, exact prefixes for identifiers, ordering of people's names, etc.\n",
119
+ "- DO NOT include commas, quotes, or any other characters that might interfere with JSON serialization or CSV generation\n",
120
+ "\n",
121
+ "Your response:\n",
122
+ "'''\n",
123
+ "\n",
124
+ "def get_data_str_from_df_for_prompt(df, use_head=True, num_rows_to_return=NUM_ROWS_TO_RETURN):\n",
125
+ " data = df.head(num_rows_to_return) if use_head else df.tail(num_rows_to_return)\n",
126
+ " return f'<df>\\n{data.to_markdown()}\\n</df>'\n",
127
+ "\n",
128
+ "table_mapping_parser = PydanticOutputParser(pydantic_object=TableMapping)\n",
129
+ "analyst_prompt = ChatPromptTemplate.from_template(\n",
130
+ " template=analyst_prompt_str, \n",
131
+ " partial_variables={'format_instructions': table_mapping_parser.get_format_instructions()},\n",
132
+ ")\n",
133
+ "\n",
134
+ "mapping_chain = analyst_prompt | transform_model | table_mapping_parser\n",
135
+ "table_mapping: TableMapping = mapping_chain.invoke({\"source_1_csv_str\": get_data_str_from_df_for_prompt(table_1_df), \"target_csv_str\": get_data_str_from_df_for_prompt(template_df)})"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": 12,
141
+ "metadata": {},
142
+ "outputs": [],
143
+ "source": [
144
+ "# spec writer\n",
145
+ "spec_writer_prompt_str = '''\n",
146
+ "You are an expert product manager and technical writer for a software company, who generates clean, concise, precise specification documents for your employees.\n",
147
+ "Your job is to write a plaintext spec for a python script for a software engineer to develop a component within an ETL pipeline.\n",
148
+ "\n",
149
+ "This document must include 100% of the information your employee needs to write a successful script to transform source_df to target_df.\n",
150
+ "However, DO NOT include the original table_mapping. Your job is to translate everything into natural language.\n",
151
+ "\n",
152
+ "Here is a stringified pydantic object that describes the mapping and the transformation steps:\n",
153
+ "\n",
154
+ "{table_mapping}\n",
155
+ "\n",
156
+ "You must translate this into clean, concise, and complete instructions for your employee.\n",
157
+ "\n",
158
+ "This document should be formatted like a technical document in plaintext. Do not include code or data.\n",
159
+ "\n",
160
+ "This document must include:\n",
161
+ "- Overview\n",
162
+ "- Input (source_df), Output (target_df)\n",
163
+ "- Exact column mapping\n",
164
+ "- Exact transformation steps for each column\n",
165
+ "- Precise instructions for what this script should do\n",
166
+ "- Script input: Pandas Dataframe named `source_df`.\n",
167
+ "- Script output: Pandas Dataframe named `target_df`.\n",
168
+ "- Do not modify the source_df. Create a new dataframe named target_df.\n",
169
+ "- This script should never include the source data. It should only include the transformations required to create the target_df.\n",
170
+ "- Return the target_df.\n",
171
+ "\n",
172
+ "You will never see this employee. They cannot contact you. You will never see their code. You must include 100% of the information they need to write a successful script.\n",
173
+ "Remember:\n",
174
+ "- Clean: No extra information, no formatting aside from plaintext\n",
175
+ "- Concise: Your employees benefit from brevity\n",
176
+ "- Precise: your words must be unambiguous, exact, and fully represent a perfect translation of the table_mapping object.\n",
177
+ "\n",
178
+ "Your response:\n",
179
+ "'''\n",
180
+ "spec_writer_prompt = ChatPromptTemplate.from_template(spec_writer_prompt_str)\n",
181
+ "\n",
182
+ "spec_writer_chain = spec_writer_prompt | natural_language_model | StrOutputParser()\n",
183
+ "spec_str = spec_writer_chain.invoke({\"table_mapping\": str(table_mapping)})"
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "code",
188
+ "execution_count": 19,
189
+ "metadata": {},
190
+ "outputs": [],
191
+ "source": [
192
+ "engineer_prompt_str = '''\n",
193
+ "You are a Senior Software Engineer, who specializes in writing Python code for ETL pipelines.\n",
194
+ "Your Product Manager has written a spec for a new transformation script. You must follow this document exactly, write python code that implements the spec, validate that code, and then return it.\n",
195
+ "Your output should only be python code in Markdown format, eg:\n",
196
+ " ```python\n",
197
+ " ....\n",
198
+ " ```\n",
199
+ "Do not return any additional text / explanation. This code will be executed by a robot without human intervention.\n",
200
+ "\n",
201
+ "Here is the technical specification for your code:\n",
202
+ "\n",
203
+ "{spec_str}\n",
204
+ "\n",
205
+ "Remember: return only clean python code in markdown format. The python interpreter running this code will already have `source_df` as a local variable.\n",
206
+ "\n",
207
+ "You must return `target_df` at the end.\n",
208
+ "'''\n",
209
+ "engineer_prompt = ChatPromptTemplate.from_template(engineer_prompt_str)\n",
210
+ "\n",
211
+ "# engineer_chain = engineer_prompt | transform_model | StrOutputParser() | PythonAstREPLTool(locals={'source_df': table_1_df}).run\n",
212
+ "# table_1_df_transformed = engineer_chain.invoke({\"spec_str\": spec_str})\n",
213
+ "engineer_chain = engineer_prompt | transform_model | StrOutputParser()\n",
214
+ "transform_code = engineer_chain.invoke({\"spec_str\": spec_str})"
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": 17,
220
+ "metadata": {},
221
+ "outputs": [
222
+ {
223
+ "name": "stdout",
224
+ "output_type": "stream",
225
+ "text": [
226
+ "Running on local URL: http://127.0.0.1:7874\n",
227
+ "\n",
228
+ "To create a public link, set `share=True` in `launch()`.\n"
229
+ ]
230
+ },
231
+ {
232
+ "data": {
233
+ "text/html": [
234
+ "<div><iframe src=\"http://127.0.0.1:7874/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
235
+ ],
236
+ "text/plain": [
237
+ "<IPython.core.display.HTML object>"
238
+ ]
239
+ },
240
+ "metadata": {},
241
+ "output_type": "display_data"
242
+ },
243
+ {
244
+ "data": {
245
+ "text/plain": []
246
+ },
247
+ "execution_count": 17,
248
+ "metadata": {},
249
+ "output_type": "execute_result"
250
+ }
251
+ ],
252
+ "source": [
253
+ "def show_mapping(file):\n",
254
+ " # TODO: add code\n",
255
+ " return pd.DataFrame(table_mapping.dict()['table_mappings'])\n",
256
+ "demo = gr.Interface(fn=show_mapping, inputs=[\"file\"], outputs='dataframe')\n",
257
+ "demo.launch()"
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "code",
262
+ "execution_count": 34,
263
+ "metadata": {},
264
+ "outputs": [
265
+ {
266
+ "name": "stdout",
267
+ "output_type": "stream",
268
+ "text": [
269
+ "Running on local URL: http://127.0.0.1:7885\n",
270
+ "\n",
271
+ "Thanks for being a Gradio user! If you have questions or feedback, please join our Discord server and chat with us: https://discord.gg/feTf9x3ZSB\n",
272
+ "\n",
273
+ "To create a public link, set `share=True` in `launch()`.\n"
274
+ ]
275
+ },
276
+ {
277
+ "data": {
278
+ "text/html": [
279
+ "<div><iframe src=\"http://127.0.0.1:7885/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
280
+ ],
281
+ "text/plain": [
282
+ "<IPython.core.display.HTML object>"
283
+ ]
284
+ },
285
+ "metadata": {},
286
+ "output_type": "display_data"
287
+ },
288
+ {
289
+ "data": {
290
+ "text/plain": []
291
+ },
292
+ "execution_count": 34,
293
+ "metadata": {},
294
+ "output_type": "execute_result"
295
+ }
296
+ ],
297
+ "source": [
298
+ "def _sanitize_python_output(text: str):\n",
299
+ " _, after = text.split(\"```python\")\n",
300
+ " return after.split(\"```\")[0]\n",
301
+ "\n",
302
+ "def show_code(button):\n",
303
+ " # TODO: add code\n",
304
+ " return _sanitize_python_output(transform_code)\n",
305
+ "check_mapping_text = 'How does that mapping look? \\n\\nFeel free to update it: your changes will be incorporated! \\n\\nWhen you are ready, click the Submit button below, and the mapping code will be generated for your approval.'\n",
306
+ "demo = gr.Interface(fn=show_code, inputs=[gr.Textbox(value=check_mapping_text, interactive=False)], outputs=[gr.Code(language=\"python\")])\n",
307
+ "demo.launch()"
308
+ ]
309
+ },
310
+ {
311
+ "cell_type": "code",
312
+ "execution_count": 41,
313
+ "metadata": {},
314
+ "outputs": [
315
+ {
316
+ "name": "stderr",
317
+ "output_type": "stream",
318
+ "text": [
319
+ "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/4236222443.py:4: GradioDeprecationWarning: `layout` parameter is deprecated, and it has no effect\n",
320
+ " demo = gr.Interface(\n"
321
+ ]
322
+ },
323
+ {
324
+ "name": "stdout",
325
+ "output_type": "stream",
326
+ "text": [
327
+ "Running on local URL: http://127.0.0.1:7892\n",
328
+ "\n",
329
+ "To create a public link, set `share=True` in `launch()`.\n"
330
+ ]
331
+ },
332
+ {
333
+ "data": {
334
+ "text/html": [
335
+ "<div><iframe src=\"http://127.0.0.1:7892/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
336
+ ],
337
+ "text/plain": [
338
+ "<IPython.core.display.HTML object>"
339
+ ]
340
+ },
341
+ "metadata": {},
342
+ "output_type": "display_data"
343
+ },
344
+ {
345
+ "data": {
346
+ "text/plain": []
347
+ },
348
+ "execution_count": 41,
349
+ "metadata": {},
350
+ "output_type": "execute_result"
351
+ }
352
+ ],
353
+ "source": [
354
+ "def get_transformed_table(button):\n",
355
+ " return template_df, PythonAstREPLTool(locals={'source_df': table_1_df}).run(transform_code)\n",
356
+ "check_mapping_text = 'How does that code look? \\n\\nWhen you are ready, click the Submit button and the source file will be transformed.'\n",
357
+ "demo = gr.Interface(\n",
358
+ " fn=get_transformed_table,\n",
359
+ " inputs=[gr.Textbox(value=check_mapping_text, interactive=False)],\n",
360
+ " outputs=[gr.Dataframe(label='Template Table (target)'), gr.Dataframe(label='Table 1 (transformed)')],\n",
361
+ " layout=\"column\",\n",
362
+ " examples=[[1]],\n",
363
+ ")\n",
364
+ "demo.launch()"
365
+ ]
366
+ },
367
+ {
368
+ "cell_type": "code",
369
+ "execution_count": 89,
370
+ "metadata": {},
371
+ "outputs": [
372
+ {
373
+ "name": "stderr",
374
+ "output_type": "stream",
375
+ "text": [
376
+ "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/2180252060.py:18: GradioDeprecationWarning: Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components\n",
377
+ " gr.inputs.File(label=\"Template\", type=\"file\", file_count='single')\n",
378
+ "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/2180252060.py:18: GradioDeprecationWarning: `optional` parameter is deprecated, and it has no effect\n",
379
+ " gr.inputs.File(label=\"Template\", type=\"file\", file_count='single')\n",
380
+ "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/2180252060.py:18: GradioDeprecationWarning: `keep_filename` parameter is deprecated, and it has no effect\n",
381
+ " gr.inputs.File(label=\"Template\", type=\"file\", file_count='single')\n",
382
+ "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/2180252060.py:19: GradioDeprecationWarning: Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components\n",
383
+ " gr.inputs.File(label=\"Source\", type=\"file\", file_count='single')\n",
384
+ "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/2180252060.py:19: GradioDeprecationWarning: `optional` parameter is deprecated, and it has no effect\n",
385
+ " gr.inputs.File(label=\"Source\", type=\"file\", file_count='single')\n",
386
+ "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/2180252060.py:19: GradioDeprecationWarning: `keep_filename` parameter is deprecated, and it has no effect\n",
387
+ " gr.inputs.File(label=\"Source\", type=\"file\", file_count='single')\n",
388
+ "/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/gradio/utils.py:841: UserWarning: Expected 1 arguments for function <function generate_code at 0x12cb559d0>, received 0.\n",
389
+ " warnings.warn(\n",
390
+ "/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/gradio/utils.py:845: UserWarning: Expected at least 1 arguments for function <function generate_code at 0x12cb559d0>, received 0.\n",
391
+ " warnings.warn(\n",
392
+ "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/2180252060.py:39: GradioUnusedKwargWarning: You have unused kwarg parameters in Button, please remove them: {'trigger': 'transform_source'}\n",
393
+ " gr.Button(value=\"Transform Source\", variant=\"primary\", trigger=\"transform_source\")\n",
394
+ "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/2180252060.py:40: GradioUnusedKwargWarning: You have unused kwarg parameters in Button, please remove them: {'trigger': 'save_code'}\n",
395
+ " gr.Button(value=\"Save Code\", variant=\"secondary\", trigger=\"save_code\")\n"
396
+ ]
397
+ },
398
+ {
399
+ "name": "stdout",
400
+ "output_type": "stream",
401
+ "text": [
402
+ "Running on local URL: http://127.0.0.1:7934\n",
403
+ "\n",
404
+ "To create a public link, set `share=True` in `launch()`.\n"
405
+ ]
406
+ },
407
+ {
408
+ "data": {
409
+ "text/html": [
410
+ "<div><iframe src=\"http://127.0.0.1:7934/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
411
+ ],
412
+ "text/plain": [
413
+ "<IPython.core.display.HTML object>"
414
+ ]
415
+ },
416
+ "metadata": {},
417
+ "output_type": "display_data"
418
+ },
419
+ {
420
+ "data": {
421
+ "text/plain": []
422
+ },
423
+ "execution_count": 89,
424
+ "metadata": {},
425
+ "output_type": "execute_result"
426
+ }
427
+ ],
428
+ "source": [
429
+ "def _sanitize_python_output(text: str):\n",
430
+ " _, after = text.split(\"```python\")\n",
431
+ " return after.split(\"```\")[0]\n",
432
+ "\n",
433
+ "def do_stuff(val):\n",
434
+ " print(val)\n",
435
+ "\n",
436
+ "def generate_code(val):\n",
437
+ " return '# check this out'\n",
438
+ "\n",
439
+ "def save_csv_file(df, filename):\n",
440
+ " df.to_csv(os.path.join(data_dir_path, 'output', filename) + '.csv')\n",
441
+ "\n",
442
+ "with gr.Blocks() as demo:\n",
443
+ " with gr.Column():\n",
444
+ " gr.Markdown(\"## To begin, upload a Template CSV and a Source CSV file.\")\n",
445
+ " with gr.Row():\n",
446
+ " gr.inputs.File(label=\"Template\", type=\"file\", file_count='single')\n",
447
+ " gr.inputs.File(label=\"Source\", type=\"file\", file_count='single')\n",
448
+ "\n",
449
+ " with gr.Column():\n",
450
+ " gr.Markdown(\"## Mapping from Source to Template\")\n",
451
+ " with gr.Row():\n",
452
+ " table_mapping_df = pd.DataFrame(table_mapping.dict()['table_mappings'])\n",
453
+ " gr.DataFrame(value=table_mapping_df)\n",
454
+ " save_mapping_btn = gr.Button(value=\"Save Mapping\", variant=\"secondary\")\n",
455
+ " save_mapping_btn.click(fn=lambda : save_csv_file(table_mapping_df, 'table_mapping'))\n",
456
+ "\n",
457
+ " with gr.Row():\n",
458
+ " test = gr.Markdown()\n",
459
+ " generate_code_btn = gr.Button(value=\"Generate Code from Mapping\", variant=\"primary\")\n",
460
+ " generate_code_btn.click(fn=generate_code, outputs=test)\n",
461
+ "\n",
462
+ " with gr.Column():\n",
463
+ " gr.Markdown(\"## Here is the code that will be used to transform the source file into the template schema:\")\n",
464
+ " gr.Code(language=\"python\", value=_sanitize_python_output(transform_code))\n",
465
+ "\n",
466
+ " with gr.Row():\n",
467
+ " gr.Button(value=\"Transform Source\", variant=\"primary\", trigger=\"transform_source\")\n",
468
+ " gr.Button(value=\"Save Code\", variant=\"secondary\", trigger=\"save_code\")\n",
469
+ " \n",
470
+ " with gr.Row():\n",
471
+ " with gr.Column():\n",
472
+ " gr.Dataframe(label='Target (template)', type='pandas', value=template_df)\n",
473
+ " with gr.Column():\n",
474
+ " gr.Dataframe(label='Source (transformed)', type='pandas', value=PythonAstREPLTool(locals={'source_df': table_1_df}).run(transform_code))\n",
475
+ "\n",
476
+ "demo.launch()"
477
+ ]
478
+ },
479
+ {
480
+ "cell_type": "code",
481
+ "execution_count": null,
482
+ "metadata": {},
483
+ "outputs": [],
484
+ "source": []
485
+ }
486
+ ],
487
+ "metadata": {
488
+ "kernelspec": {
489
+ "display_name": "venv",
490
+ "language": "python",
491
+ "name": "python3"
492
+ },
493
+ "language_info": {
494
+ "codemirror_mode": {
495
+ "name": "ipython",
496
+ "version": 3
497
+ },
498
+ "file_extension": ".py",
499
+ "mimetype": "text/x-python",
500
+ "name": "python",
501
+ "nbconvert_exporter": "python",
502
+ "pygments_lexer": "ipython3",
503
+ "version": "3.9.6"
504
+ },
505
+ "orig_nbformat": 4
506
+ },
507
+ "nbformat": 4,
508
+ "nbformat_minor": 2
509
+ }