Spaces:
Runtime error
Runtime error
Commit
·
88e8643
1
Parent(s):
8183509
added example data
Browse files
- .gitignore +3 -0
- app.py +21 -3
- src/core.py +5 -5
- src/data/actual/table_A.csv +11 -0
- src/data/actual/table_B.csv +11 -0
- src/data/actual/template.csv +11 -0
- src/prompt.py +3 -3
- src/types.py +1 -1
- wandb/run-20230818_001703-qce5cvts/files/requirements.txt +125 -0
.gitignore
CHANGED
@@ -2,3 +2,6 @@ venv/
|
|
2 |
__pycache__/
|
3 |
*.pyc
|
4 |
.env
|
|
|
|
|
|
|
|
2 |
__pycache__/
|
3 |
*.pyc
|
4 |
.env
|
5 |
+
*.json
|
6 |
+
wandb/
|
7 |
+
*.log
|
app.py
CHANGED
@@ -8,10 +8,15 @@ from src.core import (
|
|
8 |
|
9 |
MAX_ROWS = 10
|
10 |
|
11 |
-
|
12 |
def generate_step_markdown(step_number: int, subtitle: str, description: str = None):
|
13 |
return gr.Markdown(f"# Step {step_number}\n\n ### {subtitle}\n{description}")
|
14 |
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
# TODO: use tempfile
|
17 |
def export_csv(df, filename):
|
@@ -20,12 +25,13 @@ def export_csv(df, filename):
|
|
20 |
|
21 |
|
22 |
# TODO: use tempfile
|
23 |
-
def export_text(
|
24 |
with open(filename, "w") as f:
|
25 |
-
f.write(
|
26 |
return gr.File.update(value=filename, visible=True)
|
27 |
|
28 |
|
|
|
29 |
with gr.Blocks() as demo:
|
30 |
gr.Markdown(
|
31 |
"# LLM Data Mapper\n\nThis tool will help you map a source CSV to a template CSV, and then generate python code to transform the source CSV into the template CSV. You can edit all of the values, re-run the processes, and download files along the way."
|
@@ -42,21 +48,33 @@ with gr.Blocks() as demo:
|
|
42 |
label="Upload Template File",
|
43 |
file_types=[".csv"],
|
44 |
file_count="single",
|
|
|
|
|
|
|
|
|
|
|
45 |
)
|
46 |
template_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
|
47 |
upload_template_btn.upload(
|
48 |
fn=process_csv_text, inputs=upload_template_btn, outputs=template_df
|
49 |
)
|
|
|
50 |
with gr.Column():
|
51 |
upload_source_button = gr.UploadButton(
|
52 |
label="Upload Source File",
|
53 |
file_types=[".csv"],
|
54 |
file_count="single",
|
|
|
|
|
|
|
|
|
|
|
55 |
)
|
56 |
source_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
|
57 |
upload_source_button.upload(
|
58 |
fn=process_csv_text, inputs=upload_source_button, outputs=source_df
|
59 |
)
|
|
|
60 |
|
61 |
# STEP 2
|
62 |
generate_step_markdown(
|
|
|
8 |
|
9 |
MAX_ROWS = 10
|
10 |
|
11 |
+
import pandas as pd
|
12 |
def generate_step_markdown(step_number: int, subtitle: str, description: str = None):
|
13 |
return gr.Markdown(f"# Step {step_number}\n\n ### {subtitle}\n{description}")
|
14 |
|
15 |
+
example_df = pd.read_csv('./src/data/synthetic/legal_entries_a.csv')
|
16 |
+
def load_example_template(template_df, example_df):
|
17 |
+
return template_df.update(example_df)
|
18 |
+
|
19 |
+
|
20 |
|
21 |
# TODO: use tempfile
|
22 |
def export_csv(df, filename):
|
|
|
25 |
|
26 |
|
27 |
# TODO: use tempfile
|
28 |
+
def export_text(content, filename):
|
29 |
with open(filename, "w") as f:
|
30 |
+
f.write(content)
|
31 |
return gr.File.update(value=filename, visible=True)
|
32 |
|
33 |
|
34 |
+
|
35 |
with gr.Blocks() as demo:
|
36 |
gr.Markdown(
|
37 |
"# LLM Data Mapper\n\nThis tool will help you map a source CSV to a template CSV, and then generate python code to transform the source CSV into the template CSV. You can edit all of the values, re-run the processes, and download files along the way."
|
|
|
48 |
label="Upload Template File",
|
49 |
file_types=[".csv"],
|
50 |
file_count="single",
|
51 |
+
variant="primary",
|
52 |
+
)
|
53 |
+
load_template_btn = gr.Button(
|
54 |
+
value="Load Example Template File",
|
55 |
+
variant="secondary",
|
56 |
)
|
57 |
template_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
|
58 |
upload_template_btn.upload(
|
59 |
fn=process_csv_text, inputs=upload_template_btn, outputs=template_df
|
60 |
)
|
61 |
+
load_template_btn.click(lambda _: pd.read_csv('./src/data/actual/template.csv'), upload_template_btn, template_df)
|
62 |
with gr.Column():
|
63 |
upload_source_button = gr.UploadButton(
|
64 |
label="Upload Source File",
|
65 |
file_types=[".csv"],
|
66 |
file_count="single",
|
67 |
+
variant="primary",
|
68 |
+
)
|
69 |
+
load_source_button = gr.Button(
|
70 |
+
value="Load Example Source File",
|
71 |
+
variant="secondary",
|
72 |
)
|
73 |
source_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
|
74 |
upload_source_button.upload(
|
75 |
fn=process_csv_text, inputs=upload_source_button, outputs=source_df
|
76 |
)
|
77 |
+
load_source_button.click(lambda _: pd.read_csv('./src/data/actual/table_A.csv'), upload_source_button, source_df)
|
78 |
|
79 |
# STEP 2
|
80 |
generate_step_markdown(
|
src/core.py
CHANGED
@@ -76,12 +76,12 @@ def generate_mapping_code(table_mapping_df) -> str:
|
|
76 |
return engineer_chain.invoke({"table_mapping": str(table_mapping_df.to_dict())})
|
77 |
|
78 |
|
79 |
-
def process_csv_text(
|
80 |
-
"""Process a CSV file into a dataframe, either from a string or a file."""
|
81 |
-
if isinstance(
|
82 |
-
df = pd.read_csv(
|
83 |
else:
|
84 |
-
df = pd.read_csv(
|
85 |
return df
|
86 |
|
87 |
|
|
|
76 |
return engineer_chain.invoke({"table_mapping": str(table_mapping_df.to_dict())})
|
77 |
|
78 |
|
79 |
+
def process_csv_text(value):
|
80 |
+
"""Process a CSV file into a dataframe, either from a string path or a file."""
|
81 |
+
if isinstance(value, str):
|
82 |
+
df = pd.read_csv(value)
|
83 |
else:
|
84 |
+
df = pd.read_csv(value.name)
|
85 |
return df
|
86 |
|
87 |
|
src/data/actual/table_A.csv
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Date_of_Policy,FullName,Insurance_Plan,Policy_No,Monthly_Premium,Department,JobTitle,Policy_Start,Full_Name,Insurance_Type,Policy_Num,Monthly_Cost
|
2 |
+
05/01/2023,John Doe,Gold Plan,AB-12345,150.00,IT,Software Engineer,05/01/2023,John Doe,Gold,AB-12345,150.00
|
3 |
+
05/02/2023,Jane Smith,Silver Plan,CD-67890,100.00,HR,HR Manager,05/02/2023,Jane Smith,Silver,CD-67890,100.00
|
4 |
+
05/03/2023,Michael Brown,Bronze Plan,EF-10111,50.00,Marketing,Marketing Coordinator,05/03/2023,Michael Brown,Bronze,EF-10111,50.00
|
5 |
+
05/04/2023,Alice Johnson,Gold Plan,GH-12121,150.00,Finance,Financial Analyst,05/04/2023,Alice Johnson,Gold,GH-12121,150.00
|
6 |
+
05/05/2023,Bob Wilson,Silver Plan,IJ-13131,100.00,Sales,Sales Executive,05/05/2023,Bob Wilson,Silver,IJ-13131,100.00
|
7 |
+
05/06/2023,Carol Martinez,Bronze Plan,KL-14141,50.00,Operations,Operations Manager,05/06/2023,Carol Martinez,Bronze,KL-14141,50.00
|
8 |
+
05/07/2023,David Anderson,Gold Plan,MN-15151,150.00,Legal,Attorney,05/07/2023,David Anderson,Gold,MN-15151,150.00
|
9 |
+
05/08/2023,Eva Thomas,Silver Plan,OP-16161,100.00,Product,Product Manager,05/08/2023,Eva Thomas,Silver,OP-16161,100.00
|
10 |
+
05/09/2023,Frank Jackson,Bronze Plan,QR-17171,50.00,Engineering,Engineer,05/09/2023,Frank Jackson,Bronze,QR-17171,50.00
|
11 |
+
05/10/2023,Grace White,Gold Plan,ST-18181,150.00,Design,Graphic Designer,05/10/2023,Grace White,Gold,ST-18181,150.00
|
src/data/actual/table_B.csv
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
PolicyDate,Name,PlanType,Policy_ID,PremiumAmount,Hobby,MaritalStatus,StartDate,Employee_Name,Plan_Name,PolicyID,Cost
|
2 |
+
2023-05-01,Doe, John,GoldPackage,AB12345,Photography,Married,2023-05-01,John Doe,Gold,AB12345,150
|
3 |
+
2023-05-02,Smith, Jane,SilverPackage,CD67890,Reading,Single,2023-05-02,Jane Smith,Silver,CD67890,100
|
4 |
+
2023-05-03,Brown, Michael,BronzePackage,EF10111,Traveling,Divorced,2023-05-03,Michael Brown,Bronze,EF10111,50
|
5 |
+
2023-05-04,Johnson, Alice,GoldPackage,GH12121,Cooking,Married,2023-05-04,Alice Johnson,Gold,GH12121,150
|
6 |
+
2023-05-05,Wilson, Bob,SilverPackage,IJ13131,Hiking,Single,2023-05-05,Bob Wilson,Silver,IJ13131,100
|
7 |
+
2023-05-06,Martinez, Carol,BronzePackage,KL14141,Swimming,Divorced,2023-05-06,Carol Martinez,Bronze,KL14141,50
|
8 |
+
2023-05-07,Anderson, David,GoldPackage,MN15151,Gaming,Married,2023-05-07,David Anderson,Gold,MN15151,150
|
9 |
+
2023-05-08,Thomas, Eva,SilverPackage,OP16161,Painting,Single,2023-05-08,Eva Thomas,Silver,OP16161,100
|
10 |
+
2023-05-09,Jackson, Frank,BronzePackage,QR17171,Writing,Divorced,2023-05-09,Frank Jackson,Bronze,QR17171,50
|
11 |
+
2023-05-10,White, Grace,GoldPackage,ST18181,Dancing,Married,2023-05-10,Grace White,Gold,ST18181,150
|
src/data/actual/template.csv
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Date,EmployeeName,Plan,PolicyNumber,Premium
|
2 |
+
01-05-2023,John Doe,Gold,AB12345,150
|
3 |
+
02-05-2023,Jane Smith,Silver,CD67890,100
|
4 |
+
03-05-2023,Michael Brown,Bronze,EF10111,50
|
5 |
+
04-05-2023,Alice Johnson,Gold,GH12121,150
|
6 |
+
05-05-2023,Bob Wilson,Silver,IJ13131,100
|
7 |
+
06-05-2023,Carol Martinez,Bronze,KL14141,50
|
8 |
+
07-05-2023,David Anderson,Gold,MN15151,150
|
9 |
+
08-05-2023,Eva Thomas,Silver,OP16161,100
|
10 |
+
09-05-2023,Frank Jackson,Bronze,QR17171,50
|
11 |
+
10-05-2023,Grace White,Gold,ST18181,150
|
src/prompt.py
CHANGED
@@ -12,10 +12,10 @@ Head of `target_csv`:
|
|
12 |
Your job is to generate a thorough, precise summary of how `source_csv` should be transformed to adhere exactly to the `target_csv` schema.
|
13 |
|
14 |
For each column in the `source_csv`, you must communicate which column in the `target_csv` it maps to, and how the values in the `source_csv` column should be transformed to match those in the `target_csv`.
|
15 |
-
You
|
|
|
16 |
|
17 |
-
|
18 |
-
1. Which column in `target_csv` it maps to. You should consider the semantic meaning of the columns, not just the character similarity.
|
19 |
|
20 |
Example mappings:
|
21 |
- 'MunICipality' in `source_csv` should map to 'City' in `target_csv`.
|
|
|
12 |
Your job is to generate a thorough, precise summary of how `source_csv` should be transformed to adhere exactly to the `target_csv` schema.
|
13 |
|
14 |
For each column in the `source_csv`, you must communicate which column in the `target_csv` it maps to, and how the values in the `source_csv` column should be transformed to match those in the `target_csv`.
|
15 |
+
You should consider the semantic meaning of the columns, not just the character similarity or positioning.
|
16 |
+
If there is no corresponding column in the `target_csv`, you should write 'DROP_COLUMN' in the `target_csv` column.
|
17 |
|
18 |
+
You can assume the rows are aligned: that is, the first row in `source_csv` corresponds to the first row in `target_csv`, and so on.
|
|
|
19 |
|
20 |
Example mappings:
|
21 |
- 'MunICipality' in `source_csv` should map to 'City' in `target_csv`.
|
src/types.py
CHANGED
@@ -9,7 +9,7 @@ class TableMappingEntry(BaseModel):
|
|
9 |
)
|
10 |
target_column_name: str = Field(
|
11 |
...,
|
12 |
-
description="Name of the column in the target table, to which the source column maps.",
|
13 |
)
|
14 |
value_transformations: str = Field(
|
15 |
...,
|
|
|
9 |
)
|
10 |
target_column_name: str = Field(
|
11 |
...,
|
12 |
+
description="Name of the column in the target table, to which the source column maps. If there is no target_column, write 'DROP_COLUMN'.",
|
13 |
)
|
14 |
value_transformations: str = Field(
|
15 |
...,
|
wandb/run-20230818_001703-qce5cvts/files/requirements.txt
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiofiles==23.2.1
|
2 |
+
aiohttp==3.8.5
|
3 |
+
aiosignal==1.3.1
|
4 |
+
altair==5.0.1
|
5 |
+
anyio==3.7.1
|
6 |
+
appdirs==1.4.4
|
7 |
+
appnope==0.1.3
|
8 |
+
asttokens==2.2.1
|
9 |
+
async-timeout==4.0.3
|
10 |
+
attrs==23.1.0
|
11 |
+
backcall==0.2.0
|
12 |
+
black==23.7.0
|
13 |
+
certifi==2023.7.22
|
14 |
+
charset-normalizer==3.2.0
|
15 |
+
click==8.1.6
|
16 |
+
comm==0.1.4
|
17 |
+
contourpy==1.1.0
|
18 |
+
cycler==0.11.0
|
19 |
+
dataclasses-json==0.5.14
|
20 |
+
debugpy==1.6.7.post1
|
21 |
+
decorator==5.1.1
|
22 |
+
docker-pycreds==0.4.0
|
23 |
+
exceptiongroup==1.1.3
|
24 |
+
executing==1.2.0
|
25 |
+
fastapi==0.101.1
|
26 |
+
ffmpy==0.3.1
|
27 |
+
filelock==3.12.2
|
28 |
+
fonttools==4.42.0
|
29 |
+
frozenlist==1.4.0
|
30 |
+
fsspec==2023.6.0
|
31 |
+
gitdb==4.0.10
|
32 |
+
gitpython==3.1.32
|
33 |
+
gradio-client==0.4.0
|
34 |
+
gradio==3.40.1
|
35 |
+
greenlet==2.0.2
|
36 |
+
h11==0.14.0
|
37 |
+
httpcore==0.17.3
|
38 |
+
httpx==0.24.1
|
39 |
+
huggingface-hub==0.16.4
|
40 |
+
idna==3.4
|
41 |
+
importlib-metadata==6.8.0
|
42 |
+
importlib-resources==6.0.1
|
43 |
+
ipykernel==6.25.1
|
44 |
+
ipython==8.14.0
|
45 |
+
jedi==0.19.0
|
46 |
+
jinja2==3.1.2
|
47 |
+
jsonschema-specifications==2023.7.1
|
48 |
+
jsonschema==4.19.0
|
49 |
+
jupyter-client==8.3.0
|
50 |
+
jupyter-core==5.3.1
|
51 |
+
kiwisolver==1.4.4
|
52 |
+
langchain==0.0.264
|
53 |
+
langsmith==0.0.22
|
54 |
+
linkify-it-py==2.0.2
|
55 |
+
markdown-it-py==2.2.0
|
56 |
+
markupsafe==2.1.3
|
57 |
+
marshmallow==3.20.1
|
58 |
+
matplotlib-inline==0.1.6
|
59 |
+
matplotlib==3.7.2
|
60 |
+
mdit-py-plugins==0.3.3
|
61 |
+
mdurl==0.1.2
|
62 |
+
multidict==6.0.4
|
63 |
+
mypy-extensions==1.0.0
|
64 |
+
nest-asyncio==1.5.7
|
65 |
+
numexpr==2.8.5
|
66 |
+
numpy==1.25.2
|
67 |
+
openai==0.27.8
|
68 |
+
openapi-schema-pydantic==1.2.4
|
69 |
+
orjson==3.9.4
|
70 |
+
packaging==23.1
|
71 |
+
pandas==2.0.3
|
72 |
+
parso==0.8.3
|
73 |
+
pathspec==0.11.2
|
74 |
+
pathtools==0.1.2
|
75 |
+
pexpect==4.8.0
|
76 |
+
pickleshare==0.7.5
|
77 |
+
pillow==10.0.0
|
78 |
+
pip==23.2.1
|
79 |
+
platformdirs==3.10.0
|
80 |
+
prompt-toolkit==3.0.39
|
81 |
+
protobuf==4.24.0
|
82 |
+
psutil==5.9.5
|
83 |
+
ptyprocess==0.7.0
|
84 |
+
pure-eval==0.2.2
|
85 |
+
pydantic==1.10.12
|
86 |
+
pydub==0.25.1
|
87 |
+
pygments==2.16.1
|
88 |
+
pyparsing==3.0.9
|
89 |
+
python-dateutil==2.8.2
|
90 |
+
python-dotenv==1.0.0
|
91 |
+
python-multipart==0.0.6
|
92 |
+
pytz==2023.3
|
93 |
+
pyyaml==6.0.1
|
94 |
+
pyzmq==25.1.1
|
95 |
+
referencing==0.30.2
|
96 |
+
requests==2.31.0
|
97 |
+
rpds-py==0.9.2
|
98 |
+
semantic-version==2.10.0
|
99 |
+
sentry-sdk==1.29.2
|
100 |
+
setproctitle==1.3.2
|
101 |
+
setuptools==56.0.0
|
102 |
+
six==1.16.0
|
103 |
+
smmap==5.0.0
|
104 |
+
sniffio==1.3.0
|
105 |
+
sqlalchemy==2.0.19
|
106 |
+
stack-data==0.6.2
|
107 |
+
starlette==0.27.0
|
108 |
+
tabulate==0.9.0
|
109 |
+
tenacity==8.2.3
|
110 |
+
tomli==2.0.1
|
111 |
+
toolz==0.12.0
|
112 |
+
tornado==6.3.3
|
113 |
+
tqdm==4.66.1
|
114 |
+
traitlets==5.9.0
|
115 |
+
typing-extensions==4.7.1
|
116 |
+
typing-inspect==0.9.0
|
117 |
+
tzdata==2023.3
|
118 |
+
uc-micro-py==1.0.2
|
119 |
+
urllib3==2.0.4
|
120 |
+
uvicorn==0.23.2
|
121 |
+
wandb==0.15.8
|
122 |
+
wcwidth==0.2.6
|
123 |
+
websockets==11.0.3
|
124 |
+
yarl==1.9.2
|
125 |
+
zipp==3.16.2
|