# Page banner captured along with this script: "Spaces: Runtime error".
import json
import os

import numpy as np
import pandas as pd
import pinecone
import streamlit as st
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer
# Pinecone credentials are taken from the environment (PINECONE_API_KEY /
# PINECONE_ENVIRONMENT) so a deployment can supply its own values without
# editing this file.
# NOTE(review): the literal fallback below is a credential committed to source.
# It is kept only so existing deployments keep working unchanged; rotate the
# key and delete the fallback once the environment variable is set everywhere.
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY', '72677043-918a-4a15-9077-9c5b3cc40df9'),
    environment=os.environ.get('PINECONE_ENVIRONMENT', 'us-west4-gcp'),
)

# Sentence-embedding model used by main() to encode the search query; pinned to CPU.
model = SentenceTransformer('all-mpnet-base-v2', device='cpu')
def process_string(s):
    """Normalize *s* for matching: lowercase it and spell out ampersands.

    Every ``&`` is replaced by the word ``and`` after lowercasing, so
    ``"AT&T"`` becomes ``"atandt"``.
    """
    lowered = s.lower()
    normalized = lowered.replace('&', 'and')
    return normalized
def levenshtein_distance(s1, s2):
    """Return ``fuzz.ratio(s1, s2)`` for the two strings.

    Despite the name, callers in this file sort the result in descending
    order, i.e. they treat a larger value as a closer match.
    """
    score = fuzz.ratio(s1, s2)
    return score
def compare_string_all(string, df):
    """Return the five rows of *df* that best match *string* on ``cleaned_text``.

    Args:
        string: Raw query text; it is normalized with ``process_string``.
        df: DataFrame providing the ``cleaned_text``, ``label`` and
            ``Ingredients`` columns.

    Returns:
        A new DataFrame with columns ``label``, ``Ingredients`` and
        ``distance`` holding the five highest-scoring rows, best first.
        ``distance`` is the value returned by ``levenshtein_distance``.

    The input frame is left untouched: the score column is attached to a
    derived frame instead of being written into *df*.
    """
    query = process_string(string)
    scores = df['cleaned_text'].apply(
        lambda text: levenshtein_distance(query, text.lower())
    )
    # assign() builds a new frame, so the caller's DataFrame gains no column.
    scored = df[['label', 'Ingredients']].assign(distance=scores)
    return scored.sort_values('distance', ascending=False).head(5)
def compare_string_label(string, df):
    """Return the five rows of *df* that best match *string* on ``cleaned_label``.

    Args:
        string: Raw query text; it is normalized with ``process_string``.
        df: DataFrame providing the ``cleaned_label``, ``label`` and
            ``Ingredients`` columns.

    Returns:
        A new DataFrame with columns ``label``, ``Ingredients`` and
        ``distance`` holding the five highest-scoring rows, best first.
        ``distance`` is the value returned by ``levenshtein_distance``.

    The input frame is left untouched: the score column is attached to a
    derived frame instead of being written into *df*.
    """
    query = process_string(string)
    scores = df['cleaned_label'].apply(
        lambda text: levenshtein_distance(query, text.lower())
    )
    # assign() builds a new frame, so the caller's DataFrame gains no column.
    scored = df[['label', 'Ingredients']].assign(distance=scores)
    return scored.sort_values('distance', ascending=False).head(5)
# Load the dataset and derive the normalized columns read by the
# compare_string_* helpers.
df = pd.read_json('cleaned.json')

# "label : Ingredients" text for each row. The column key keeps its original
# spelling because it is a runtime identifier.
combined_text = df['label'] + ' : ' + df['Ingredients']
df['label+ingradient'] = combined_text
df['cleaned_text'] = combined_text.map(process_string)
df['cleaned_label'] = df['label'].map(process_string)

# Handle to the Pinecone index queried by main().
index = pinecone.Index('companiessearch')
# Create a Streamlit app | |
def main():
    """Render the Streamlit page and run a semantic company-name search.

    The page shows a text box and an "Enter" button. When the button is
    pressed with a non-empty query, the normalized query is encoded with the
    module-level ``model``, sent to the module-level Pinecone ``index``, and
    the returned matches are displayed as a table with the columns
    ``Company_name``, ``Country`` and ``score``.
    """
    st.set_page_config(page_title="String Matching App", page_icon=":smiley:", layout="wide")
    st.title("Company name matching App :smiley:")

    # Only one page exists today; the sidebar radio is kept so more can be added.
    pages = ["Semantic search"]
    page = st.sidebar.radio("Select a page", pages)
    if page != pages[0]:
        return

    st.header("Matches using embeddings (semantic search)")
    st.write("Enter a company name:")
    st.write("e.g. Airtel Africa Plc")
    input_string = process_string(st.text_input(""))

    if not st.button("Enter"):
        return

    # Do not embed and query an empty string when nothing was typed.
    if not input_string.strip():
        st.warning("Please enter a company name before searching.")
        return

    # A single constant drives both the query size and the caption, so the
    # number announced to the user always matches what is requested.
    top_k = 10
    st.write(f"Top {top_k} matches using semantic search:")

    # encode() receives a one-item list, so xq is presumably a list holding a
    # single embedding vector — TODO confirm index.query expects this shape.
    xq = model.encode([input_string]).tolist()
    result = index.query(xq, top_k=top_k, includeMetadata=True)

    rows = [
        (match['metadata']['name'], match['metadata']['Country'], match['score'])
        for match in result['matches']
    ]
    final_result = pd.DataFrame(rows, columns=['Company_name', 'Country', 'score'])
    st.dataframe(final_result)
# Build the page only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()