File size: 3,506 Bytes
25f2580
 
 
 
 
 
 
 
3ad4fe0
d2385a6
25f2580
 
d1c30b8
 
25f2580
d1c30b8
 
 
 
25f2580
d1c30b8
25f2580
 
 
 
 
 
 
d1c30b8
 
25f2580
 
 
 
 
 
 
 
 
 
 
d2385a6
016f019
3ad4fe0
25f2580
a290a0d
3ad4fe0
 
 
 
5f4aff6
 
 
5454958
3ad4fe0
 
 
5f4aff6
3ad4fe0
 
25f2580
3ad4fe0
 
 
0812987
3ad4fe0
 
 
 
 
 
 
 
d1c30b8
 
3ad4fe0
 
d2385a6
0812987
3ad4fe0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import streamlit as st
import pandas as pd
import linktransformer as lt

# Serialize a DataFrame so it can be handed to a download widget
def convert_df_to_csv(df):
    """Return *df* rendered as CSV text, encoded to UTF-8 bytes.

    The CSV is produced with ``DataFrame.to_csv()`` defaults, so the index
    is written as the first column.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')

# Page header: the app title followed by a one-sentence description.
st.title('Merge Dataframes using LinkTransformer')
st.write(
    'LinkTransformer supports several AI-powered data wrangling operations - '
    'here is an example that allows you to use LLMs to merge data.'
)

# Function to load DataFrame
def load_dataframe(upload):
    """Read an uploaded file into a pandas DataFrame.

    Args:
        upload: File-like object exposing a ``name`` attribute (such as the
            value returned by ``st.file_uploader``), or ``None`` when nothing
            has been uploaded.

    Returns:
        The parsed DataFrame, or an empty DataFrame when ``upload`` is
        ``None``. Names ending in ``.csv`` (any letter case) are parsed with
        ``pd.read_csv``; everything else is handed to ``pd.read_excel``.
    """
    if upload is None:
        return pd.DataFrame()

    # Compare the extension case-insensitively so that e.g. "DATA.CSV" is
    # parsed as CSV instead of falling through to the Excel reader.
    if upload.name.lower().endswith('.csv'):
        return pd.read_csv(upload)
    return pd.read_excel(upload)
# Upload widgets for the two tables to be merged (the widgets accept CSV only).
df1_upload = st.file_uploader("Upload DataFrame 1 (CSV)", type=['csv'], key='df1_upload')
df2_upload = st.file_uploader("Upload DataFrame 2 (CSV)", type=['csv'], key='df2_upload')

# Load the DataFrames. load_dataframe hands back an empty DataFrame (not None)
# while nothing has been uploaded.
df1 = load_dataframe(df1_upload)
df2 = load_dataframe(df2_upload)

# Show a preview only when there is data to show. Testing `is not None` alone
# is always true here and would render an empty preview before any upload.
if df1 is not None and not df1.empty:
    st.write("DataFrame 1 Preview:")
    st.dataframe(df1.head())

if df2 is not None and not df2.empty:
    st.write("DataFrame 2 Preview:")
    st.dataframe(df2.head())


# Model selection: free-text model path, pre-filled with "all-MiniLM-L6-v2".
# More on model selection is available at https://linktransformer.github.io/
model_path = st.text_input("Model path (HuggingFace)", value="all-MiniLM-L6-v2")
st.write(
    "We have trained several record linkage models! "
    "Just copy the Hugging Face model path from the "
    "[model zoo](https://linktransformer.github.io/)."
)

# Merge workflow: needs both uploads present and both tables non-empty.
if df1_upload is not None and df2_upload is not None:
    if not df1.empty and not df2.empty:
        # Let the user pick the key columns on each side; the first column of
        # each table is pre-selected.
        columns_df1 = df1.columns.tolist()
        columns_df2 = df2.columns.tolist()
        selected_columns_df1 = st.multiselect("Select columns from DataFrame 1 to match on:", columns_df1, default=columns_df1[0])
        selected_columns_df2 = st.multiselect("Select columns from DataFrame 2 to match on:", columns_df2, default=columns_df2[0])

        # Perform merge
        if st.button("Merge DataFrames"):
            if not selected_columns_df1 or not selected_columns_df2:
                # The multiselects can be cleared entirely; do not call the
                # merge with empty key lists.
                st.warning("Please select at least one column from each DataFrame to match on.")
            else:
                model = lt.LinkTransformer(model_path)
                df_lm_matched = lt.merge(
                    df1,
                    df2,
                    merge_type='1:m',
                    on=None,
                    model=model,
                    left_on=selected_columns_df1,
                    right_on=selected_columns_df2,
                )
                st.write("Merged DataFrame Preview:")
                st.dataframe(df_lm_matched.head())

                # Download button for merged DataFrame
                csv = convert_df_to_csv(df_lm_matched)
                st.download_button(
                    label="Download as CSV",
                    data=csv,
                    file_name='merged_dataframe.csv',
                    mime='text/csv',
                )
    else:
        st.write("It appears that your dataframes are empty. Please upload valid dataframes.")
else:
    st.write("Please upload or enter paths for both DataFrames.")
            
# Footer: CPU-only caveat, project website and citation, written in this order.
_FOOTER_NOTES = (
    "Note that this space only supports CPU usage and is only recommended on small datasets. If you have access to the GPU, check out our python [package](https://github.com/dell-research-harvard/linktransformer/)!",
    "For more information and advanced usage, please visit the [LinkTransformer website](https://linktransformer.github.io/).",
    "If you use LinkTransformer in your research, please cite the following paper: [LinkTransformer: A Unified Package for Record Linkage with Transformer Language Models](https://arxiv.org/abs/2309.00789)",
)
for _note in _FOOTER_NOTES:
    st.write(_note)