Spaces:
Runtime error
Runtime error
File size: 3,506 Bytes
25f2580 3ad4fe0 d2385a6 25f2580 d1c30b8 25f2580 d1c30b8 25f2580 d1c30b8 25f2580 d1c30b8 25f2580 d2385a6 016f019 3ad4fe0 25f2580 a290a0d 3ad4fe0 5f4aff6 5454958 3ad4fe0 5f4aff6 3ad4fe0 25f2580 3ad4fe0 0812987 3ad4fe0 d1c30b8 3ad4fe0 d2385a6 0812987 3ad4fe0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import streamlit as st
import pandas as pd
import linktransformer as lt
# Function to convert DataFrame to CSV for download
def convert_df_to_csv(df):
    """Serialize a DataFrame to UTF-8 encoded CSV bytes.

    Args:
        df: The pandas DataFrame to export.

    Returns:
        bytes: The text produced by ``df.to_csv()`` (the index is written as
        the first column), encoded as UTF-8.
    """
    return df.to_csv().encode('utf-8')
# Page header: title followed by a one-line description of the demo.
APP_TITLE = 'Merge Dataframes using LinkTransformer'
APP_INTRO = 'LinkTransformer supports several AI-powered data wrangling operations - here is an example that allows you to use LLMs to merge data.'
st.title(APP_TITLE)
st.write(APP_INTRO)
# Function to load DataFrame
def load_dataframe(upload):
    """Read an uploaded file into a DataFrame.

    Args:
        upload: A file-like object exposing a ``name`` attribute, or ``None``
            when nothing has been uploaded.

    Returns:
        pandas.DataFrame: The parsed contents. A name ending in ``csv``
        (compared case-insensitively) is read with ``pd.read_csv``; anything
        else is read with ``pd.read_excel``. An empty DataFrame is returned
        when ``upload`` is ``None``.
    """
    if upload is None:
        return pd.DataFrame()
    # Lower-case the name first so that e.g. "DATA.CSV" is not handed to the
    # Excel reader.
    if upload.name.lower().endswith('csv'):
        return pd.read_csv(upload)
    return pd.read_excel(upload)
# Options for DataFrame 1
df1_upload = st.file_uploader("Upload DataFrame 1 (CSV)", type=['csv'], key='df1_upload')
# Options for DataFrame 2
df2_upload = st.file_uploader("Upload DataFrame 2 (CSV)", type=['csv'], key='df2_upload')

# Load and display the DataFrames
df1 = load_dataframe(df1_upload)
df2 = load_dataframe(df2_upload)

# load_dataframe returns an empty DataFrame (never None) when nothing has been
# uploaded, so test for emptiness; an "is not None" check would always pass
# and render an empty preview before any file is chosen.
if not df1.empty:
    st.write("DataFrame 1 Preview:")
    st.dataframe(df1.head())
if not df2.empty:
    st.write("DataFrame 2 Preview:")
    st.dataframe(df2.head())
# Model selection: the user types a Hugging Face model path.
# More on model selection is available at https://linktransformer.github.io/
model_path = st.text_input(
    label="Model path (HuggingFace)",
    value="all-MiniLM-L6-v2",
)
st.write(
    "We have trained several record linkage models! Just copy the Hugging Face model path from the [model zoo](https://linktransformer.github.io/)."
)
# Merge section: shown only once both files have been uploaded.
if df1_upload is not None and df2_upload is not None:
    if not df1.empty and not df2.empty:
        # Columns to match on; the first column of each frame is pre-selected.
        # A non-empty DataFrame always has at least one column, so [0] is safe.
        columns_df1 = df1.columns.tolist()
        columns_df2 = df2.columns.tolist()
        selected_columns_df1 = st.multiselect(
            "Select columns from DataFrame 1 to match on:",
            columns_df1,
            default=[columns_df1[0]],
        )
        selected_columns_df2 = st.multiselect(
            "Select columns from DataFrame 2 to match on:",
            columns_df2,
            default=[columns_df2[0]],
        )

        # Perform merge
        # NOTE(review): st.button is presumably True only on the rerun caused
        # by the click, so the preview and download button below would vanish
        # on the next widget interaction - confirm, and persist the result in
        # st.session_state if that matters.
        if st.button("Merge DataFrames"):
            if not selected_columns_df1 or not selected_columns_df2:
                # The user can clear a multiselect; do not call lt.merge
                # with an empty key list.
                st.write("Please select at least one column from each DataFrame to match on.")
            else:
                model = lt.LinkTransformer(model_path)
                df_lm_matched = lt.merge(
                    df1,
                    df2,
                    merge_type='1:m',
                    on=None,
                    model=model,
                    left_on=selected_columns_df1,
                    right_on=selected_columns_df2,
                )
                st.write("Merged DataFrame Preview:")
                st.dataframe(df_lm_matched.head())

                # Download button for merged DataFrame
                csv = convert_df_to_csv(df_lm_matched)
                st.download_button(
                    label="Download as CSV",
                    data=csv,
                    file_name='merged_dataframe.csv',
                    mime='text/csv',
                )
    else:
        st.write("It appears that your dataframes are empty. Please upload valid dataframes.")
else:
    st.write("Please upload or enter paths for both DataFrames.")
# Footer: CPU-only usage note, link to the project website, and citation.
for footer_note in (
    "Note that this space only supports CPU usage and is only recommended on small datasets. If you have access to the GPU, check out our python [package](https://github.com/dell-research-harvard/linktransformer/)!",
    "For more information and advanced usage, please visit the [LinkTransformer website](https://linktransformer.github.io/).",
    "If you use LinkTransformer in your research, please cite the following paper: [LinkTransformer: A Unified Package for Record Linkage with Transformer Language Models](https://arxiv.org/abs/2309.00789)",
):
    st.write(footer_note)
|