Spaces:

andymbryant
/

data-mapper

Runtime error

App Files Files Community

andymbryant committed on Aug 18, 2023

Commit

88e8643

1 Parent(s): 8183509

added example data

Browse files

Files changed (9) hide show

.gitignore +3 -0
app.py +21 -3
src/core.py +5 -5
src/data/actual/table_A.csv +11 -0
src/data/actual/table_B.csv +11 -0
src/data/actual/template.csv +11 -0
src/prompt.py +3 -3
src/types.py +1 -1
wandb/run-20230818_001703-qce5cvts/files/requirements.txt +125 -0

.gitignore CHANGED Viewed

@@ -2,3 +2,6 @@ venv/
 __pycache__/
 *.pyc
 .env

 __pycache__/
 *.pyc
 .env
+*.json
+wandb/
+*.log

app.py CHANGED Viewed

@@ -8,10 +8,15 @@ from src.core import (
 MAX_ROWS = 10
 def generate_step_markdown(step_number: int, subtitle: str, description: str = None):
     return gr.Markdown(f"# Step {step_number}\n\n ### {subtitle}\n{description}")
 # TODO: use tempfile
 def export_csv(df, filename):
@@ -20,12 +25,13 @@ def export_csv(df, filename):
 # TODO: use tempfile
-def export_text(val, filename):
     with open(filename, "w") as f:
-        f.write(val)
     return gr.File.update(value=filename, visible=True)
 with gr.Blocks() as demo:
     gr.Markdown(
         "# LLM Data Mapper\n\nThis tool will help you map a source CSV to a template CSV, and then generate python code to transform the source CSV into the template CSV. You can edit all of the values, re-run the processes, and download files along the way."
@@ -42,21 +48,33 @@ with gr.Blocks() as demo:
                 label="Upload Template File",
                 file_types=[".csv"],
                 file_count="single",
             )
             template_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
             upload_template_btn.upload(
                 fn=process_csv_text, inputs=upload_template_btn, outputs=template_df
             )
         with gr.Column():
             upload_source_button = gr.UploadButton(
                 label="Upload Source File",
                 file_types=[".csv"],
                 file_count="single",
             )
             source_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
             upload_source_button.upload(
                 fn=process_csv_text, inputs=upload_source_button, outputs=source_df
             )
     # STEP 2
     generate_step_markdown(

 MAX_ROWS = 10
+import pandas as pd
 def generate_step_markdown(step_number: int, subtitle: str, description: str = None):
     return gr.Markdown(f"# Step {step_number}\n\n ### {subtitle}\n{description}")
+example_df = pd.read_csv('./src/data/synthetic/legal_entries_a.csv')
+def load_example_template(template_df, example_df):
+    return template_df.update(example_df)
 # TODO: use tempfile
 def export_csv(df, filename):
 # TODO: use tempfile
+def export_text(content, filename):
     with open(filename, "w") as f:
+        f.write(content)
     return gr.File.update(value=filename, visible=True)
 with gr.Blocks() as demo:
     gr.Markdown(
         "# LLM Data Mapper\n\nThis tool will help you map a source CSV to a template CSV, and then generate python code to transform the source CSV into the template CSV. You can edit all of the values, re-run the processes, and download files along the way."
                 label="Upload Template File",
                 file_types=[".csv"],
                 file_count="single",
+                variant="primary",
+            )
+            load_template_btn = gr.Button(
+                value="Load Example Template File",
+                variant="secondary",
             )
             template_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
             upload_template_btn.upload(
                 fn=process_csv_text, inputs=upload_template_btn, outputs=template_df
             )
+            load_template_btn.click(lambda _: pd.read_csv('./src/data/actual/template.csv'), upload_template_btn, template_df)
         with gr.Column():
             upload_source_button = gr.UploadButton(
                 label="Upload Source File",
                 file_types=[".csv"],
                 file_count="single",
+                variant="primary",
+            )
+            load_source_button = gr.Button(
+                value="Load Example Source File",
+                variant="secondary",
             )
             source_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
             upload_source_button.upload(
                 fn=process_csv_text, inputs=upload_source_button, outputs=source_df
             )
+            load_source_button.click(lambda _: pd.read_csv('./src/data/actual/table_A.csv'), upload_source_button, source_df)
     # STEP 2
     generate_step_markdown(

src/core.py CHANGED Viewed

@@ -76,12 +76,12 @@ def generate_mapping_code(table_mapping_df) -> str:
     return engineer_chain.invoke({"table_mapping": str(table_mapping_df.to_dict())})
-def process_csv_text(temp_file):
-    """Process a CSV file into a dataframe, either from a string or a file."""
-    if isinstance(temp_file, str):
-        df = pd.read_csv(io.StringIO(temp_file))
     else:
-        df = pd.read_csv(temp_file.name)
     return df

     return engineer_chain.invoke({"table_mapping": str(table_mapping_df.to_dict())})
+def process_csv_text(value):
+    """Process a CSV file into a dataframe, either from a string path or a file."""
+    if isinstance(value, str):
+        df = pd.read_csv(value)
     else:
+        df = pd.read_csv(value.name)
     return df

src/data/actual/table_A.csv ADDED Viewed

	@@ -0,0 +1,11 @@

+Date_of_Policy,FullName,Insurance_Plan,Policy_No,Monthly_Premium,Department,JobTitle,Policy_Start,Full_Name,Insurance_Type,Policy_Num,Monthly_Cost
+05/01/2023,John Doe,Gold Plan,AB-12345,150.00,IT,Software Engineer,05/01/2023,John Doe,Gold,AB-12345,150.00
+05/02/2023,Jane Smith,Silver Plan,CD-67890,100.00,HR,HR Manager,05/02/2023,Jane Smith,Silver,CD-67890,100.00
+05/03/2023,Michael Brown,Bronze Plan,EF-10111,50.00,Marketing,Marketing Coordinator,05/03/2023,Michael Brown,Bronze,EF-10111,50.00
+05/04/2023,Alice Johnson,Gold Plan,GH-12121,150.00,Finance,Financial Analyst,05/04/2023,Alice Johnson,Gold,GH-12121,150.00
+05/05/2023,Bob Wilson,Silver Plan,IJ-13131,100.00,Sales,Sales Executive,05/05/2023,Bob Wilson,Silver,IJ-13131,100.00
+05/06/2023,Carol Martinez,Bronze Plan,KL-14141,50.00,Operations,Operations Manager,05/06/2023,Carol Martinez,Bronze,KL-14141,50.00
+05/07/2023,David Anderson,Gold Plan,MN-15151,150.00,Legal,Attorney,05/07/2023,David Anderson,Gold,MN-15151,150.00
+05/08/2023,Eva Thomas,Silver Plan,OP-16161,100.00,Product,Product Manager,05/08/2023,Eva Thomas,Silver,OP-16161,100.00
+05/09/2023,Frank Jackson,Bronze Plan,QR-17171,50.00,Engineering,Engineer,05/09/2023,Frank Jackson,Bronze,QR-17171,50.00
+05/10/2023,Grace White,Gold Plan,ST-18181,150.00,Design,Graphic Designer,05/10/2023,Grace White,Gold,ST-18181,150.00

src/data/actual/table_B.csv ADDED Viewed

	@@ -0,0 +1,11 @@

+PolicyDate,Name,PlanType,Policy_ID,PremiumAmount,Hobby,MaritalStatus,StartDate,Employee_Name,Plan_Name,PolicyID,Cost
+2023-05-01,Doe, John,GoldPackage,AB12345,Photography,Married,2023-05-01,John Doe,Gold,AB12345,150
+2023-05-02,Smith, Jane,SilverPackage,CD67890,Reading,Single,2023-05-02,Jane Smith,Silver,CD67890,100
+2023-05-03,Brown, Michael,BronzePackage,EF10111,Traveling,Divorced,2023-05-03,Michael Brown,Bronze,EF10111,50
+2023-05-04,Johnson, Alice,GoldPackage,GH12121,Cooking,Married,2023-05-04,Alice Johnson,Gold,GH12121,150
+2023-05-05,Wilson, Bob,SilverPackage,IJ13131,Hiking,Single,2023-05-05,Bob Wilson,Silver,IJ13131,100
+2023-05-06,Martinez, Carol,BronzePackage,KL14141,Swimming,Divorced,2023-05-06,Carol Martinez,Bronze,KL14141,50
+2023-05-07,Anderson, David,GoldPackage,MN15151,Gaming,Married,2023-05-07,David Anderson,Gold,MN15151,150
+2023-05-08,Thomas, Eva,SilverPackage,OP16161,Painting,Single,2023-05-08,Eva Thomas,Silver,OP16161,100
+2023-05-09,Jackson, Frank,BronzePackage,QR17171,Writing,Divorced,2023-05-09,Frank Jackson,Bronze,QR17171,50
+2023-05-10,White, Grace,GoldPackage,ST18181,Dancing,Married,2023-05-10,Grace White,Gold,ST18181,150

src/data/actual/template.csv ADDED Viewed

	@@ -0,0 +1,11 @@

+Date,EmployeeName,Plan,PolicyNumber,Premium
+01-05-2023,John Doe,Gold,AB12345,150
+02-05-2023,Jane Smith,Silver,CD67890,100
+03-05-2023,Michael Brown,Bronze,EF10111,50
+04-05-2023,Alice Johnson,Gold,GH12121,150
+05-05-2023,Bob Wilson,Silver,IJ13131,100
+06-05-2023,Carol Martinez,Bronze,KL14141,50
+07-05-2023,David Anderson,Gold,MN15151,150
+08-05-2023,Eva Thomas,Silver,OP16161,100
+09-05-2023,Frank Jackson,Bronze,QR17171,50
+10-05-2023,Grace White,Gold,ST18181,150

src/prompt.py CHANGED Viewed

@@ -12,10 +12,10 @@ Head of `target_csv`:
 Your job is to generate a thorough, precise summary of how `source_csv` should be transformed to adhere exactly to the `target_csv` schema.
 For each column in the `source_csv`, you must communicate which column in the `target_csv` it maps to, and how the values in the `source_csv` column should be transformed to match those in the `target_csv`.
-You can assume the rows are aligned: that is, the first row in `source_csv` corresponds to the first row in `target_csv`, and so on.
-Remember:
-1. Which column in `target_csv` it maps to. You should consider the semantic meaning of the columns, not just the character similarity.
 Example mappings:
 - 'MunICipality' in `source_csv` should map to 'City' in `target_csv`.

 Your job is to generate a thorough, precise summary of how `source_csv` should be transformed to adhere exactly to the `target_csv` schema.
 For each column in the `source_csv`, you must communicate which column in the `target_csv` it maps to, and how the values in the `source_csv` column should be transformed to match those in the `target_csv`.
+You should consider the semantic meaning of the columns, not just the character similarity or positioning.
+If there is no corresponding column in the `target_csv`, you should write 'DROP_COLUMN' in the `target_csv` column.
+You can assume the rows are aligned: that is, the first row in `source_csv` corresponds to the first row in `target_csv`, and so on.
 Example mappings:
 - 'MunICipality' in `source_csv` should map to 'City' in `target_csv`.

src/types.py CHANGED Viewed

@@ -9,7 +9,7 @@ class TableMappingEntry(BaseModel):
     )
     target_column_name: str = Field(
         ...,
-        description="Name of the column in the target table, to which the source column maps.",
     )
     value_transformations: str = Field(
         ...,

     )
     target_column_name: str = Field(
         ...,
+        description="Name of the column in the target table, to which the source column maps. If there is no target_column, write 'DROP_COLUMN'.",
     )
     value_transformations: str = Field(
         ...,

wandb/run-20230818_001703-qce5cvts/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,125 @@

+aiofiles==23.2.1
+aiohttp==3.8.5
+aiosignal==1.3.1
+altair==5.0.1
+anyio==3.7.1
+appdirs==1.4.4
+appnope==0.1.3
+asttokens==2.2.1
+async-timeout==4.0.3
+attrs==23.1.0
+backcall==0.2.0
+black==23.7.0
+certifi==2023.7.22
+charset-normalizer==3.2.0
+click==8.1.6
+comm==0.1.4
+contourpy==1.1.0
+cycler==0.11.0
+dataclasses-json==0.5.14
+debugpy==1.6.7.post1
+decorator==5.1.1
+docker-pycreds==0.4.0
+exceptiongroup==1.1.3
+executing==1.2.0
+fastapi==0.101.1
+ffmpy==0.3.1
+filelock==3.12.2
+fonttools==4.42.0
+frozenlist==1.4.0
+fsspec==2023.6.0
+gitdb==4.0.10
+gitpython==3.1.32
+gradio-client==0.4.0
+gradio==3.40.1
+greenlet==2.0.2
+h11==0.14.0
+httpcore==0.17.3
+httpx==0.24.1
+huggingface-hub==0.16.4
+idna==3.4
+importlib-metadata==6.8.0
+importlib-resources==6.0.1
+ipykernel==6.25.1
+ipython==8.14.0
+jedi==0.19.0
+jinja2==3.1.2
+jsonschema-specifications==2023.7.1
+jsonschema==4.19.0
+jupyter-client==8.3.0
+jupyter-core==5.3.1
+kiwisolver==1.4.4
+langchain==0.0.264
+langsmith==0.0.22
+linkify-it-py==2.0.2
+markdown-it-py==2.2.0
+markupsafe==2.1.3
+marshmallow==3.20.1
+matplotlib-inline==0.1.6
+matplotlib==3.7.2
+mdit-py-plugins==0.3.3
+mdurl==0.1.2
+multidict==6.0.4
+mypy-extensions==1.0.0
+nest-asyncio==1.5.7
+numexpr==2.8.5
+numpy==1.25.2
+openai==0.27.8
+openapi-schema-pydantic==1.2.4
+orjson==3.9.4
+packaging==23.1
+pandas==2.0.3
+parso==0.8.3
+pathspec==0.11.2
+pathtools==0.1.2
+pexpect==4.8.0
+pickleshare==0.7.5
+pillow==10.0.0
+pip==23.2.1
+platformdirs==3.10.0
+prompt-toolkit==3.0.39
+protobuf==4.24.0
+psutil==5.9.5
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pydantic==1.10.12
+pydub==0.25.1
+pygments==2.16.1
+pyparsing==3.0.9
+python-dateutil==2.8.2
+python-dotenv==1.0.0
+python-multipart==0.0.6
+pytz==2023.3
+pyyaml==6.0.1
+pyzmq==25.1.1
+referencing==0.30.2
+requests==2.31.0
+rpds-py==0.9.2
+semantic-version==2.10.0
+sentry-sdk==1.29.2
+setproctitle==1.3.2
+setuptools==56.0.0
+six==1.16.0
+smmap==5.0.0
+sniffio==1.3.0
+sqlalchemy==2.0.19
+stack-data==0.6.2
+starlette==0.27.0
+tabulate==0.9.0
+tenacity==8.2.3
+tomli==2.0.1
+toolz==0.12.0
+tornado==6.3.3
+tqdm==4.66.1
+traitlets==5.9.0
+typing-extensions==4.7.1
+typing-inspect==0.9.0
+tzdata==2023.3
+uc-micro-py==1.0.2
+urllib3==2.0.4
+uvicorn==0.23.2
+wandb==0.15.8
+wcwidth==0.2.6
+websockets==11.0.3
+yarl==1.9.2
+zipp==3.16.2