Spaces:

andymbryant
/

data-mapper

Runtime error

App Files Files Community

andymbryant commited on Aug 17, 2023

Commit

0e660e8

1 Parent(s): 527cf73

added test notebook for binder

Browse files

Files changed (1) hide show

test_notebook.ipynb +509 -0

test_notebook.ipynb ADDED Viewed

	@@ -0,0 +1,509 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas as pd\n",
+    "import gradio as gr\n",
+    "from pydantic import BaseModel, Field\n",
+    "\n",
+    "import langchain\n",
+    "from langchain.output_parsers import PydanticOutputParser\n",
+    "from langchain.prompts import ChatPromptTemplate\n",
+    "from langchain.prompts import ChatPromptTemplate\n",
+    "from langchain.tools import PythonAstREPLTool\n",
+    "from langchain.chat_models import ChatOpenAI\n",
+    "from langchain.schema.output_parser import StrOutputParser"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "langchain.debug = False\n",
+    "# Throwaway key with strict usage limit\n",
+    "os.environ[\"OPENAI_API_KEY\"] = \"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\"\n",
+    "pd.set_option('display.max_columns', 20)\n",
+    "pd.set_option('display.max_rows', 20)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_dir_path = os.path.join(os.getcwd(), 'data')\n",
+    "NUM_ROWS_TO_RETURN = 5\n",
+    "\n",
+    "table_1_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_a.csv'))\n",
+    "table_2_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_b.csv'))\n",
+    "template_df = pd.read_csv(os.path.join(data_dir_path, 'legal_template.csv'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "transform_model = ChatOpenAI(\n",
+    "    model_name='gpt-4',\n",
+    "    temperature=0,\n",
+    ")\n",
+    "\n",
+    "natural_language_model = ChatOpenAI(\n",
+    "    model_name='gpt-4',\n",
+    "    temperature=0.1,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: add validation to models, coupled with retry mechanism in chain\n",
+    "class TableMappingEntry(BaseModel):\n",
+    "    '''A single row in a table mapping. Describes how a single column in a source table maps to a single column in a target table, including any necessary transformations, and their explanations.'''\n",
+    "    source_column_name: str = Field(..., description=\"Name of the column in the source table.\")\n",
+    "    target_column_name: str = Field(..., description=\"Name of the column in the target table, to which the source column maps.\")\n",
+    "    value_transformations: str = Field(..., description=\"Transformations needed make the source values match the target values. If unncecessary, write 'NO_TRANSFORM'.\")\n",
+    "    explanation: str = Field(..., description=\"One-sentence explanation of this row (source-target mapping/transformation). Include any information that might be relevant to a software engineer building an ETL pipeline with this document.\")\n",
+    "\n",
+    "class TableMapping(BaseModel):\n",
+    "    '''A list of table mappings collectively describe how a source table should be transformed to match the schema of a target table.'''\n",
+    "    table_mappings: list[TableMappingEntry] = Field(..., description=\"A list of table mappings.\")\n",
+    "    \n",
+    "analyst_prompt_str = '''\n",
+    "You are a Data Scientist, who specializes in generating schema mappings for use by Software Engineers in ETL pipelines.\n",
+    "\n",
+    "Head of `source_csv`:\n",
+    "\n",
+    "{source_1_csv_str}\n",
+    "\n",
+    "Head of `target_csv`:\n",
+    "\n",
+    "{target_csv_str}\n",
+    "\n",
+    "Your job is to generate a thorough, precise summary of how `source_csv` should be transformed to adhere exactly to the `target_csv` schema.\n",
+    "\n",
+    "For each column in the `source_csv`, you must communicate which column in the `target_csv` it maps to, and how the values in the `source_csv` column should be transformed to match those in the `target_csv`.\n",
+    "You can assume the rows are aligned: that is, the first row in `source_csv` corresponds to the first row in `target_csv`, and so on.\n",
+    "\n",
+    "Remember:\n",
+    "1. Which column in `target_csv` it maps to. You should consider the semantic meaning of the columns, not just the character similarity. \n",
+    "\n",
+    "Example mappings:\n",
+    "- 'MunICipality' in `source_csv` should map to 'City' in `target_csv`.\n",
+    "- 'fullname' in `source_csv` should map to both 'FirstName' and 'LastName' in `target_csv`. You must explain this transformation, as well, including the target sequencing of first and last name.\n",
+    "\n",
+    "Example transformations:\n",
+    "- If date in `source_csv` is `2020-01-01` and date in `target_csv` is `01/01/2020`, explain exactly how this should be transformed and the reasoning behind it.\n",
+    "- If city in `source_csv` is `New York` and city in `target_csv` is `NEW YORK` or `NYC`, explain exactly how this should be transformed and the reasoning behind it.\n",
+    "\n",
+    "Lastly, point out any other oddities, such as duplicate columns, erroneous columns, etc.\n",
+    "\n",
+    "{format_instructions}\n",
+    "\n",
+    "Remember:\n",
+    "- Be concise: you are speaking to engineers, not customers.\n",
+    "- Be precise: all of these values are case sensitive. Consider casing for city names, exact prefixes for identifiers, ordering of people's names, etc.\n",
+    "- DO NOT include commas, quotes, or any other characters that might interfere with JSON serialization or CSV generation\n",
+    "\n",
+    "Your response:\n",
+    "'''\n",
+    "\n",
+    "def get_data_str_from_df_for_prompt(df, use_head=True, num_rows_to_return=NUM_ROWS_TO_RETURN):\n",
+    "    data = df.head(num_rows_to_return) if use_head else df.tail(num_rows_to_return)\n",
+    "    return f'<df>\\n{data.to_markdown()}\\n</df>'\n",
+    "\n",
+    "table_mapping_parser = PydanticOutputParser(pydantic_object=TableMapping)\n",
+    "analyst_prompt = ChatPromptTemplate.from_template(\n",
+    "    template=analyst_prompt_str, \n",
+    "    partial_variables={'format_instructions': table_mapping_parser.get_format_instructions()},\n",
+    ")\n",
+    "\n",
+    "mapping_chain = analyst_prompt | transform_model | table_mapping_parser\n",
+    "table_mapping: TableMapping = mapping_chain.invoke({\"source_1_csv_str\": get_data_str_from_df_for_prompt(table_1_df), \"target_csv_str\": get_data_str_from_df_for_prompt(template_df)})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# spec writer\n",
+    "spec_writer_prompt_str = '''\n",
+    "You are an expert product manager and technical writer for a software company, who generates clean, concise, precise specification documents for your employees.\n",
+    "Your job is to write a plaintext spec for a python script for a software engineer to develop a component within an ETL pipeline.\n",
+    "\n",
+    "This document must include 100% of the information your employee needs to write a successful script to transform source_df to target_df.\n",
+    "However, DO NOT include the original table_mapping. Your job is to translate everything into natural language.\n",
+    "\n",
+    "Here is a stringified pydantic object that describes the mapping and the transformation steps:\n",
+    "\n",
+    "{table_mapping}\n",
+    "\n",
+    "You must translate this into clean, concise, and complete instructions for your employee.\n",
+    "\n",
+    "This document should be formatted like a technical document in plaintext. Do not include code or data.\n",
+    "\n",
+    "This document must include:\n",
+    "- Overview\n",
+    "- Input (source_df), Output (target_df)\n",
+    "- Exact column mapping\n",
+    "- Exact transformation steps for each column\n",
+    "- Precise instructions for what this script should do\n",
+    "- Script input: Pandas Dataframe named `source_df`.\n",
+    "- Script output: Pandas Dataframe named `target_df`.\n",
+    "- Do not modify the source_df. Create a new dataframe named target_df.\n",
+    "- This script should never include the source data. It should only include the transormations required to create the target_df.\n",
+    "- Return the target_df.\n",
+    "\n",
+    "You will never see this employee. They cannot contact you. You will never see their code. You must include 100% of the information they need to write a successful script.\n",
+    "Remember:\n",
+    "- Clean: No extra information, no formatting aside from plaintext\n",
+    "- Concise: Your employees benefit from brevity\n",
+    "- Precise: your words must be unambiguous, exact, and full represent a perfect translation of the table_mapping object.\n",
+    "\n",
+    "Your response:\n",
+    "'''\n",
+    "spec_writer_prompt = ChatPromptTemplate.from_template(spec_writer_prompt_str)\n",
+    "\n",
+    "spec_writer_chain = spec_writer_prompt | natural_language_model | StrOutputParser()\n",
+    "spec_str = spec_writer_chain.invoke({\"table_mapping\": str(table_mapping)})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "engineer_prompt_str = '''\n",
+    "You are a Senior Software Engineer, who specializes in writing Python code for ETL pipelines.\n",
+    "Your Product Manager has written a spec for a new transormation script. You must follow this document exactly, write python code that implements the spec, validate that code, and then return it.\n",
+    "Your output should only be python code in Markdown format, eg:\n",
+    "    ```python\n",
+    "    ....\n",
+    "    ```\"\"\"\n",
+    "Do not return any additional text / explanation. This code will be executed by a robot without human intervention.\n",
+    "\n",
+    "Here is the technical specification for your code:\n",
+    "\n",
+    "{spec_str}\n",
+    "\n",
+    "Remember: return only clean python code in markdown format. The python interpreter running this code will already have `source_df` as a local variable.\n",
+    "\n",
+    "Your must return `target_df` at the end.\n",
+    "'''\n",
+    "engineer_prompt = ChatPromptTemplate.from_template(engineer_prompt_str)\n",
+    "\n",
+    "# engineer_chain = engineer_prompt | transform_model | StrOutputParser() | PythonAstREPLTool(locals={'source_df': table_1_df}).run\n",
+    "# table_1_df_transformed = engineer_chain.invoke({\"spec_str\": spec_str})\n",
+    "engineer_chain = engineer_prompt | transform_model | StrOutputParser()\n",
+    "transform_code = engineer_chain.invoke({\"spec_str\": spec_str})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on local URL:  http://127.0.0.1:7874\n",
+      "\n",
+      "To create a public link, set `share=True` in `launch()`.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"http://127.0.0.1:7874/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": []
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def show_mapping(file):\n",
+    "    # TODO: add code\n",
+    "    return pd.DataFrame(table_mapping.dict()['table_mappings'])\n",
+    "demo = gr.Interface(fn=show_mapping, inputs=[\"file\"], outputs='dataframe')\n",
+    "demo.launch()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on local URL:  http://127.0.0.1:7885\n",
+      "\n",
+      "Thanks for being a Gradio user! If you have questions or feedback, please join our Discord server and chat with us: https://discord.gg/feTf9x3ZSB\n",
+      "\n",
+      "To create a public link, set `share=True` in `launch()`.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"http://127.0.0.1:7885/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": []
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def _sanitize_python_output(text: str):\n",
+    "    _, after = text.split(\"```python\")\n",
+    "    return after.split(\"```\")[0]\n",
+    "\n",
+    "def show_code(button):\n",
+    "    # TODO: add code\n",
+    "    return _sanitize_python_output(transform_code)\n",
+    "check_mapping_text = 'How does that mapping look? \\n\\nFeel free to update it: your changes will be incorporated! \\n\\nWhen you are ready, click the Submit below, and the mapping code will be generated for your approval.'\n",
+    "demo = gr.Interface(fn=show_code, inputs=[gr.Textbox(value=check_mapping_text, interactive=False)], outputs=[gr.Code(language=\"python\")])\n",
+    "demo.launch()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/4236222443.py:4: GradioDeprecationWarning: `layout` parameter is deprecated, and it has no effect\n",
+      "  demo = gr.Interface(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on local URL:  http://127.0.0.1:7892\n",
+      "\n",
+      "To create a public link, set `share=True` in `launch()`.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"http://127.0.0.1:7892/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": []
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def get_transformed_table(button):\n",
+    "    return template_df, PythonAstREPLTool(locals={'source_df': table_1_df}).run(transform_code)\n",
+    "check_mapping_text = 'How does that code look? \\n\\nWhen you are ready, click the Submit button and the transformed source file will be transformed.'\n",
+    "demo = gr.Interface(\n",
+    "    fn=get_transformed_table,\n",
+    "    inputs=[gr.Textbox(value=check_mapping_text, interactive=False)],\n",
+    "    outputs=[gr.Dataframe(label='Template Table (target)'), gr.Dataframe(label='Table 1 (transformed)')],\n",
+    "    layout=\"column\",\n",
+    "    examples=[[1]],\n",
+    ")\n",
+    "demo.launch()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/2180252060.py:18: GradioDeprecationWarning: Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components\n",
+      "  gr.inputs.File(label=\"Template\", type=\"file\", file_count='single')\n",
+      "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/2180252060.py:18: GradioDeprecationWarning: `optional` parameter is deprecated, and it has no effect\n",
+      "  gr.inputs.File(label=\"Template\", type=\"file\", file_count='single')\n",
+      "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/2180252060.py:18: GradioDeprecationWarning: `keep_filename` parameter is deprecated, and it has no effect\n",
+      "  gr.inputs.File(label=\"Template\", type=\"file\", file_count='single')\n",
+      "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/2180252060.py:19: GradioDeprecationWarning: Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components\n",
+      "  gr.inputs.File(label=\"Source\", type=\"file\", file_count='single')\n",
+      "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/2180252060.py:19: GradioDeprecationWarning: `optional` parameter is deprecated, and it has no effect\n",
+      "  gr.inputs.File(label=\"Source\", type=\"file\", file_count='single')\n",
+      "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/2180252060.py:19: GradioDeprecationWarning: `keep_filename` parameter is deprecated, and it has no effect\n",
+      "  gr.inputs.File(label=\"Source\", type=\"file\", file_count='single')\n",
+      "/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/gradio/utils.py:841: UserWarning: Expected 1 arguments for function <function generate_code at 0x12cb559d0>, received 0.\n",
+      "  warnings.warn(\n",
+      "/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/gradio/utils.py:845: UserWarning: Expected at least 1 arguments for function <function generate_code at 0x12cb559d0>, received 0.\n",
+      "  warnings.warn(\n",
+      "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/2180252060.py:39: GradioUnusedKwargWarning: You have unused kwarg parameters in Button, please remove them: {'trigger': 'transform_source'}\n",
+      "  gr.Button(value=\"Transform Source\", variant=\"primary\", trigger=\"transform_source\")\n",
+      "/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_94012/2180252060.py:40: GradioUnusedKwargWarning: You have unused kwarg parameters in Button, please remove them: {'trigger': 'save_code'}\n",
+      "  gr.Button(value=\"Save Code\", variant=\"secondary\", trigger=\"save_code\")\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on local URL:  http://127.0.0.1:7934\n",
+      "\n",
+      "To create a public link, set `share=True` in `launch()`.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"http://127.0.0.1:7934/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": []
+     },
+     "execution_count": 89,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def _sanitize_python_output(text: str):\n",
+    "    _, after = text.split(\"```python\")\n",
+    "    return after.split(\"```\")[0]\n",
+    "\n",
+    "def do_stuff(val):\n",
+    "    print(val)\n",
+    "\n",
+    "def generate_code(val):\n",
+    "    return '# check this out'\n",
+    "\n",
+    "def save_csv_file(df, filename):\n",
+    "    df.to_csv(os.path.join(data_dir_path, 'output', filename) + '.csv')\n",
+    "\n",
+    "with gr.Blocks() as demo:\n",
+    "    with gr.Column():\n",
+    "        gr.Markdown(\"## To begin, upload a Template CSV and a Source CSV file.\")\n",
+    "        with gr.Row():\n",
+    "            gr.inputs.File(label=\"Template\", type=\"file\", file_count='single')\n",
+    "            gr.inputs.File(label=\"Source\", type=\"file\", file_count='single')\n",
+    "\n",
+    "    with gr.Column():\n",
+    "        gr.Markdown(\"## Mapping from Source to Template\")\n",
+    "        with gr.Row():\n",
+    "            table_mapping_df = pd.DataFrame(table_mapping.dict()['table_mappings'])\n",
+    "            gr.DataFrame(value=table_mapping_df)\n",
+    "            save_mapping_btn = gr.Button(value=\"Save Mapping\", variant=\"secondary\")\n",
+    "            save_mapping_btn.click(fn=lambda : save_csv_file(table_mapping_df, 'table_mapping'))\n",
+    "\n",
+    "    with gr.Row():\n",
+    "        test = gr.Markdown()\n",
+    "        generate_code_btn = gr.Button(value=\"Generate Code from Mapping\", variant=\"primary\")\n",
+    "        generate_code_btn.click(fn=generate_code, outputs=test)\n",
+    "\n",
+    "    with gr.Column():\n",
+    "        gr.Markdown(\"## Here is the code that will be used to transform the source file into the template schema:\")\n",
+    "        gr.Code(language=\"python\", value=_sanitize_python_output(transform_code))\n",
+    "\n",
+    "    with gr.Row():\n",
+    "        gr.Button(value=\"Transform Source\", variant=\"primary\", trigger=\"transform_source\")\n",
+    "        gr.Button(value=\"Save Code\", variant=\"secondary\", trigger=\"save_code\")\n",
+    "    \n",
+    "    with gr.Row():\n",
+    "        with gr.Column():\n",
+    "            gr.Dataframe(label='Target (template)', type='pandas', value=template_df)\n",
+    "        with gr.Column():\n",
+    "            gr.Dataframe(label='Source (transformed)', type='pandas', value=PythonAstREPLTool(locals={'source_df': table_1_df}).run(transform_code))\n",
+    "\n",
+    "demo.launch()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}