# REaLTabFormer / app.py
# Hugging Face Space page header (not part of the program):
#   author: RamAnanth1
#   last commit: a2c5ca3 - "Add explainer image"
#   file size: 6.86 kB
import os
from pathlib import Path

import gradio as gr
import pandas as pd
from realtabformer import REaLTabFormer
from scipy.io import arff
# Module-level tabular model shared by the handlers in this file: both
# generate_data and generate_relational_data call fit() on this same instance,
# so every request retrains it.
# The training budget is cut well below the library default (see inline note),
# presumably to keep the demo responsive - confirm before changing.
rtf_model = REaLTabFormer(
    model_type="tabular",
    epochs=25, # Default is 200
    gradient_accumulation_steps=4)
def _read_uploaded_table(path):
    """Load the CSV or ARFF file at *path* into a pandas DataFrame.

    Raises:
        ValueError: If the file extension is neither ``.csv`` nor ``.arff``.
    """
    # Decide by the real extension rather than a substring match, so a name
    # such as "notes.arff.csv" is parsed as CSV.
    suffix = os.path.splitext(path)[1].lower()

    if suffix == ".arff":
        # Context manager closes the handle even when parsing fails.
        with open(path, "rt") as handle:
            records, _meta = arff.loadarff(handle)
        df = pd.DataFrame(records)
        # loadarff returns nominal/string attributes as bytes; decode them so
        # the model is trained on plain text instead of bytes objects.
        for column in df.columns[df.dtypes == object]:
            df[column] = df[column].map(
                lambda value: value.decode("utf-8")
                if isinstance(value, bytes)
                else value
            )
        return df

    if suffix == ".csv":
        return pd.read_csv(path)

    raise ValueError(
        f"Unsupported file type {suffix!r}; please upload a .csv or .arff file."
    )


def generate_data(file, num_samples):
    """Fit the shared tabular model on an uploaded file and sample from it.

    Args:
        file: Uploaded file object exposing the on-disk path as ``file.name``.
        num_samples: Number of synthetic rows to generate.

    Returns:
        The synthetic samples returned by ``rtf_model.sample``.

    Raises:
        ValueError: If no file was uploaded or its extension is unsupported.
    """
    if file is None:
        raise ValueError("Please upload a .csv or .arff file first.")

    df = _read_uploaded_table(file.name)

    rtf_model.fit(df, num_bootstrap=10) # Default is 500

    # Generate synthetic data; coerce to int in case the UI delivers a float.
    return rtf_model.sample(n_samples=int(num_samples))
def generate_relational_data(parent_file, child_file, join_on):
    """Train parent and child models on two CSV files and sample both tables.

    The shared ``rtf_model`` is fitted on the parent table (without the join
    column) and saved under ``rtf_parent/``. A relational model is then built
    on top of the most recently saved parent model and fitted on the child
    table.

    Args:
        parent_file: Uploaded parent CSV; its path is read from ``.name``.
        child_file: Uploaded child CSV; its path is read from ``.name``.
        join_on: Name of the key column present in both tables.

    Returns:
        A 3-tuple ``(parent_samples, child_samples, update)`` where ``update``
        is ``gr.update(visible=True)``; the caller must map it to a component
        that should become visible.

    Raises:
        ValueError: If a file is missing or ``join_on`` is absent from either
            table.
    """
    if parent_file is None or child_file is None:
        raise ValueError("Please upload both the parent and the child CSV files.")

    parent_df = pd.read_csv(parent_file.name)
    child_df = pd.read_csv(child_file.name)

    # Make sure the join column exists in both tables. An explicit raise is
    # used because assert statements are stripped when Python runs with -O.
    missing_in = [
        label
        for label, frame in (("parent", parent_df), ("child", child_df))
        if join_on not in frame.columns
    ]
    if missing_in:
        raise ValueError(
            f"Column {join_on!r} not found in the {' and '.join(missing_in)} dataset."
        )

    rtf_model.fit(parent_df.drop(join_on, axis=1), num_bootstrap=100)

    pdir = Path("rtf_parent/")
    rtf_model.save(pdir)

    # Use the most recently modified "id*" directory under pdir as the parent
    # model (presumably the one save() just wrote - confirm against the
    # library's save layout).
    parent_model_path = max(
        (p for p in pdir.glob("id*") if p.is_dir()),
        key=os.path.getmtime,
    )

    child_model = REaLTabFormer(
        model_type="relational",
        parent_realtabformer_path=parent_model_path,
        epochs=25,
        output_max_length=None,
        train_size=0.8)

    child_model.fit(
        df=child_df,
        in_df=parent_df,
        join_on=join_on,
        num_bootstrap=10)

    # Generate parent samples.
    parent_samples = rtf_model.sample(5)

    # Create the unique ids based on the index.
    parent_samples.index.name = join_on
    parent_samples = parent_samples.reset_index()

    # Generate the relational observations.
    child_samples = child_model.sample(
        input_unique_ids=parent_samples[join_on],
        input_df=parent_samples.drop(join_on, axis=1),
        gen_batch=5)

    return parent_samples, child_samples, gr.update(visible = True)
# Custom stylesheet handed to gr.Blocks(css=...) below. It is a runtime string:
# every character inside the triple quotes is sent to the browser as-is.
# NOTE(review): ".gr-button" is declared twice, and the calc() in
# "--tw-ring-shadow" has no operator between its two operands - confirm
# against the stylesheet this was copied from before changing either.
css = """
.gradio-container {
font-family: 'IBM Plex Sans', sans-serif;
}
.gr-button {
color: white;
border-color: black;
background: black;
}
input[type='range'] {
accent-color: black;
}
.dark input[type='range'] {
accent-color: #dfdfdf;
}
.container {
max-width: 430px;
margin: auto;
padding-top: 1.5rem;
}
#gallery {
min-height: 22rem;
margin-bottom: 15px;
margin-left: auto;
margin-right: auto;
border-bottom-right-radius: .5rem !important;
border-bottom-left-radius: .5rem !important;
}
#gallery>div>.h-full {
min-height: 20rem;
}
.details:hover {
text-decoration: underline;
}
.gr-button {
white-space: nowrap;
}
.gr-button:focus {
border-color: rgb(147 197 253 / var(--tw-border-opacity));
outline: none;
box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
--tw-border-opacity: 1;
--tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
--tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px var(--tw-ring-offset-width)) var(--tw-ring-color);
--tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
--tw-ring-opacity: .5;
}
#advanced-btn {
font-size: .7rem !important;
line-height: 19px;
margin-top: 12px;
margin-bottom: 12px;
padding: 2px 8px;
border-radius: 14px !important;
}
#advanced-options {
display: none;
margin-bottom: 20px;
}
.footer {
margin-bottom: 45px;
margin-top: 35px;
text-align: center;
border-bottom: 1px solid #e5e5e5;
}
.footer>p {
font-size: .8rem;
display: inline-block;
padding: 0 10px;
transform: translateY(10px);
background: white;
}
.dark .footer {
border-color: #303030;
}
.dark .footer>p {
background: #0b0f19;
}
"""
# Build the demo UI: two input tabs (single-table and relational), two output
# tables, and the event wiring that connects them to the handlers above.
# NOTE(review): the original indentation was lost, so the nesting of the Row
# blocks relative to the Column is reconstructed - confirm the intended layout.
with gr.Blocks(css=css) as demo:
    gr.Markdown("""
## REaLTabFormer: Generating Realistic Relational and Tabular Data using Transformers
""")
    gr.HTML('''
<p style="margin-bottom: 10px; font-size: 94%">
This is an unofficial demo for REaLTabFormer, an approach that can be used to generate synthetic data from single tabular data using GPT. The demo is based on the <a href='https://github.com/avsolatorio/REaLTabFormer' style='text-decoration: underline;' target='_blank'> Github </a> implementation provided by the authors.
</p>
''')
    # NOTE(review): the image is referenced by a bare relative path; verify
    # that the running Gradio version actually serves it from this URL.
    gr.HTML('''
<p align="center"><img src="REalTabFormer_Final_EQ.png" style="width:75%"/></p>
''')

    with gr.Column():
        with gr.Tab("Upload Data as File: Tabular Data"):
            data_input_u = gr.File(
                label='Upload Data File (Currently supports CSV and ARFF)',
                file_types=[".csv", ".arff"],
            )
            # NOTE(review): with minimum=5 and step=10 the maximum of 100 is
            # not on the step grid (5, 15, ..., 95) - confirm this is intended.
            num_samples = gr.Slider(
                label="Number of Samples", minimum=5, maximum=100, value=5, step=10
            )
            generate_data_btn = gr.Button('Generate Synthetic Data')

        with gr.Tab("Upload Data as File: Relational Data"):
            data_input_parent = gr.File(
                label='Upload Data File for Parent Dataset', file_types=[".csv"]
            )
            data_input_child = gr.File(
                label='Upload Data File for Child Dataset', file_types=[".csv"]
            )
            join_on = gr.Textbox(label='Column name to join on')
            generate_data_btn_relational = gr.Button('Generate Synthetic Data')

    with gr.Row():
        data_output = gr.Dataframe(label="Synthetic Data")

    # Hidden until the relational handler returns its visibility update.
    with gr.Row(visible=False) as child_sample:
        data_output_child = gr.Dataframe(label="Synthetic Data for Child Dataset")

    generate_data_btn.click(
        generate_data,
        inputs=[data_input_u, num_samples],
        outputs=[data_output],
    )

    # generate_relational_data returns three values (parent table, child
    # table, visibility update), so three outputs are listed; the third
    # reveals the child row, which otherwise stays hidden.
    generate_data_btn_relational.click(
        generate_relational_data,
        inputs=[data_input_parent, data_input_child, join_on],
        outputs=[data_output, data_output_child, child_sample],
    )

    # cache_examples=True makes Gradio run generate_data on each example to
    # cache its output.
    examples = gr.Examples(
        examples=[['diabetes.arff', 5], ["titanic.csv", 15]],
        inputs=[data_input_u, num_samples],
        outputs=[data_output],
        cache_examples=True,
        fn=generate_data,
    )

demo.launch()