File size: 3,619 Bytes
819f923
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import json
import os

import numpy as np
import pandas as pd
import pinecone
import streamlit as st
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer

# The Pinecone API key is a credential, so it is read from the environment
# rather than being committed to source control.
# NOTE: the key that used to be hard-coded on this line is still present in
# version-control history and should be rotated.
_pinecone_api_key = os.environ.get('PINECONE_API_KEY')
if not _pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before starting the app."
    )

pinecone.init(
    api_key=_pinecone_api_key,
    # The environment name is not secret; keep the previous value as default.
    environment=os.environ.get('PINECONE_ENVIRONMENT', 'us-central1-gcp'),
)

# Sentence-embedding model used to encode queries for the semantic-search
# page; explicitly pinned to CPU.
model = SentenceTransformer('all-mpnet-base-v2', device='cpu')

def process_string(s):
    """Normalise text for matching: lowercase it and spell '&' out as 'and'."""
    lowered = s.lower()
    return lowered.replace('&', 'and')

def levenshtein_distance(s1, s2):
    """Score how alike *s1* and *s2* are by delegating to ``fuzz.ratio``.

    Despite the function's name, the returned value is used as a similarity
    score rather than an edit distance: the callers in this file sort by it
    in descending order so that the best matches come first.
    """
    similarity = fuzz.ratio(s1, s2)
    return similarity

def _top_matches(string, df, column, top_n, scorer):
    """Rank the rows of *df* by how well *column* matches *string*.

    The query is lowercased and '&' is replaced by 'and' (the same
    normalisation as ``process_string``) before being scored against the
    lowercased value of *column* in every row.

    The scores are attached to a fresh frame, so the caller's *df* is never
    modified (the previous implementation wrote a 'distance' column into it).

    Returns a new DataFrame with columns 'label', 'Ingredients' and
    'distance', sorted by 'distance' in descending order and truncated to
    *top_n* rows.
    """
    if scorer is None:
        scorer = levenshtein_distance

    query = string.lower().replace('&', 'and')
    scores = df[column].apply(lambda text: scorer(query, text.lower()))

    ranked = df[['label', 'Ingredients']].assign(distance=scores)
    return ranked.sort_values('distance', ascending=False).head(top_n)


def compare_string_all(string, df, *, top_n=5, scorer=None):
    """Return the rows whose 'cleaned_text' best matches *string*.

    Args:
        string: Raw query text; it is normalised before comparison.
        df: DataFrame providing 'label', 'Ingredients' and 'cleaned_text'
            columns. It is left unmodified.
        top_n: Maximum number of rows to return (default 5).
        scorer: Optional ``scorer(query, candidate)`` callable returning a
            score where higher means a better match. Defaults to
            ``levenshtein_distance``.

    Returns:
        A new DataFrame with columns 'label', 'Ingredients' and 'distance',
        best match first.
    """
    return _top_matches(string, df, 'cleaned_text', top_n, scorer)


def compare_string_label(string, df, *, top_n=5, scorer=None):
    """Return the rows whose 'cleaned_label' best matches *string*.

    Args:
        string: Raw query text; it is normalised before comparison.
        df: DataFrame providing 'label', 'Ingredients' and 'cleaned_label'
            columns. It is left unmodified.
        top_n: Maximum number of rows to return (default 5).
        scorer: Optional ``scorer(query, candidate)`` callable returning a
            score where higher means a better match. Defaults to
            ``levenshtein_distance``.

    Returns:
        A new DataFrame with columns 'label', 'Ingredients' and 'distance',
        best match first.
    """
    return _top_matches(string, df, 'cleaned_label', top_n, scorer)

# Load the menu dataset and precompute the normalised text columns that the
# fuzzy matchers compare against.
df = pd.read_json('cleaned.json')

# Combined "label : ingredients" text. The column name's spelling is kept
# exactly as-is because it is a runtime key.
df['label+ingradient'] = df['label'] + ' : ' + df['Ingredients']

# Derive each cleaned column from its raw counterpart with process_string.
for _raw_col, _clean_col in (
    ('label+ingradient', 'cleaned_text'),
    ('label', 'cleaned_label'),
):
    df[_clean_col] = df[_raw_col].apply(process_string)

# Handle to the Pinecone index queried by the semantic-search page.
index = pinecone.Index('menuingradientsearch')


# Streamlit app: a sidebar toggles between a fuzzy-match page and a
# semantic-search page.
def _render_fuzzy_page():
    """Draw the "Fuzzy match" page and show the top matches for the query."""
    st.header("Matches using levenshtein_distance")
    st.write("Enter a menu along with its ingredients:")
    st.write("e.g. Pita & HUMMUS Garlic Hummus, crispy seasoned pita")

    query = process_string(st.text_input(""))
    if not query:
        # Nothing typed yet: show only the prompt.
        return

    st.write("Top 5 matches:")

    # Queries longer than four words are compared against the combined
    # label + ingredients text; shorter ones against the label only.
    matcher = compare_string_all if len(query.split()) > 4 else compare_string_label
    st.dataframe(matcher(query, df))


def _render_semantic_page():
    """Draw the "Semantic search" page and show the index's top 10 hits."""
    st.header("Matches using embeddings (semantic search)")
    st.write("Enter a menu along with its ingredients:")
    st.write("e.g. Pita & HUMMUS Garlic Hummus, crispy seasoned pita")

    query = process_string(st.text_input(""))
    if not query:
        # Nothing typed yet: show only the prompt.
        return

    st.write("Top 10 matches using semantic search:")

    # Embed the query and ask the vector index for its nearest neighbours.
    xq = model.encode([query]).tolist()
    result = index.query(xq, top_k=10, includeMetadata=True)

    # One (label, ingredients, score) row per returned match.
    rows = [
        (hit['metadata']['label'], hit['metadata']['Ingredients'], hit['score'])
        for hit in result['matches']
    ]
    st.dataframe(pd.DataFrame(rows, columns=['labels', 'ingradients', 'score']))


def main():
    """Configure the page, render the sidebar and dispatch to the chosen page."""
    st.set_page_config(page_title="String Matching App", page_icon=":smiley:", layout="wide")
    st.title("String Matching App :smiley:")

    fuzzy_page, semantic_page = "Fuzzy match", "Semantic search"

    # Radio buttons in the sidebar toggle between the two pages.
    page = st.sidebar.radio("Select a page", [fuzzy_page, semantic_page])

    if page == fuzzy_page:
        _render_fuzzy_page()
    elif page == semantic_page:
        _render_semantic_page()


if __name__ == "__main__":
    main()