# Provenance: duplicated from amoldwalunj/matches_using_lavenstein_and_embeddings
# (commit 819f923). The lines above were HuggingFace page residue ("raw",
# "history blame", "3.62 kB") accidentally captured with the source; kept here
# only as a comment so the file remains valid Python.
import json
import os

import numpy as np
import pandas as pd
import pinecone
import streamlit as st
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer
pinecone.init(api_key='5c5b5687-b73d-47e9-9cc8-e184ff72cc45', environment='us-central1-gcp')
model = SentenceTransformer('all-mpnet-base-v2',device='cpu')
def process_string(s):
return s.lower().replace('&', 'and')
def levenshtein_distance(s1, s2):
return fuzz.ratio(s1, s2)
def compare_string_all(string, df):
string = string.lower().replace('&', 'and')
df['distance'] = df['cleaned_text'].apply(lambda x: levenshtein_distance(string, x.lower()))
top_5_df = df.sort_values('distance', ascending=False).head(5)
top_5_df = top_5_df[['label','Ingredients', 'distance']]
return top_5_df
def compare_string_label(string, df):
string = string.lower().replace('&', 'and')
df['distance'] = df['cleaned_label'].apply(lambda x: levenshtein_distance(string, x.lower()))
top_5_df = df.sort_values('distance', ascending=False).head(5)
top_5_df = top_5_df[['label','Ingredients', 'distance']]
return top_5_df
df= pd.read_json('cleaned.json')
df['label+ingradient'] = df['label'] + ' : ' + df['Ingredients']
df['cleaned_text']= df['label+ingradient'].apply(process_string)
df['cleaned_label'] = df['label'].apply(process_string)
index = pinecone.Index('menuingradientsearch')
# Create a Streamlit app
def main():
st.set_page_config(page_title="String Matching App", page_icon=":smiley:", layout="wide")
st.title("String Matching App :smiley:")
# Define pages
pages = ["Fuzzy match", "Semantic search"]
# Add radio buttons to toggle between pages
page = st.sidebar.radio("Select a page", pages)
if page == pages[0]:
st.header("Matches using levenshtein_distance")
st.write("Enter a menu along with its ingredients:")
st.write("e.g. Pita & HUMMUS Garlic Hummus, crispy seasoned pita")
input_string = st.text_input("")
input_string= process_string(input_string)
if input_string:
st.write("Top 5 matches:")
if len(input_string.split())>4:
top_matches = compare_string_all(input_string, df)
else:
top_matches= compare_string_label(input_string, df)
st.dataframe(top_matches)
elif page == pages[1]:
st.header("Matches using embeddings (semantic search)")
st.write("Enter a menu along with its ingredients:")
st.write("e.g. Pita & HUMMUS Garlic Hummus, crispy seasoned pita")
input_string = st.text_input("")
input_string = process_string(input_string)
if input_string:
st.write("Top 10 matches using semantic search:")
# if len(input_string.split()) > 4:
# top_matches = compare_string_all(input_string, df)
# else:
# top_matches = compare_string_label(input_string, df)
xq = model.encode([input_string]).tolist()
result = index.query(xq, top_k=10, includeMetadata=True)
labels=[]
ingradients=[]
score=[]
for matches in result['matches']:
labels.append(matches['metadata']['label'])
ingradients.append(matches['metadata']['Ingredients'])
score.append(matches['score'])
final_result= pd.DataFrame(list(zip(labels, ingradients, score)),
columns =['labels', 'ingradients','score' ])
st.dataframe(final_result)
if __name__ == "__main__":
main()