import streamlit as st
import pandas as pd
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import time
from concurrent.futures import ThreadPoolExecutor

def process_string(s):
    # Normalize an ingredient name: lowercase it and spell out ampersands.
    return s.lower().replace('&', 'and')

# st.cache_data serializes the return value, which is wasteful for a model
# object; st.cache_resource is the cache Streamlit recommends for ML models.
@st.cache_resource
def load_model():
    return SentenceTransformer(r"finetiuned_model")

def process_embedding(ingredient, model):
    # Embed one normalized ingredient; returns a 1 x dim nested list.
    processed_ingredient = process_string(ingredient)
    return model.encode([processed_ingredient]).tolist()

def faiss_query(xq, index, top_k=1):
    # FAISS expects a float32 array of shape (n_queries, dim); return the
    # distances and indices for the single query row.
    distances, indices = index.search(np.array(xq).astype('float32'), top_k)
    return distances[0], indices[0]

def get_top_matches(ingredients_flat, ingredients, loaded_model, index):
    matches = []
    scores = []

    # Generate embeddings in parallel
    with ThreadPoolExecutor() as executor:
        embeddings = list(executor.map(lambda ing: process_embedding(ing, loaded_model), ingredients))

    # Query FAISS in parallel
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(lambda xq: faiss_query(xq, index), embeddings))

    # Extract matches and scores; 1 - d/2 maps the distance onto cosine
    # similarity, assuming the index stores squared L2 distances of
    # unit-normalized embeddings.
    for distances, indices in results:
        if indices.size > 0:
            match = ingredients_flat[indices[0]]
            matches.append(match)
            scores.append(round(1 - distances[0] / 2, 2))

    return matches, scores
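
# A minimal offline sketch (not part of this app) of how a compatible
# 'faiss_index.bin' could be built, assuming the same model and L2-normalized
# embeddings so that the 1 - d/2 score above behaves like cosine similarity:
#
#   model = SentenceTransformer(r"finetiuned_model")
#   embs = model.encode([process_string(i) for i in ingredients_flat],
#                       normalize_embeddings=True).astype('float32')
#   index = faiss.IndexFlatL2(embs.shape[1])
#   index.add(embs)
#   faiss.write_index(index, 'faiss_index.bin')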

# Streamlit re-executes this script on every interaction, so the FAISS index
# and the metadata are loaded through cached helpers rather than re-read from
# disk on every rerun.
@st.cache_resource
def load_index():
    return faiss.read_index('faiss_index.bin')

@st.cache_data
def load_metadata():
    with open('metadata_faiss.json', 'r') as f:
        return json.load(f)

index = load_index()
metadata = load_metadata()
ingredients_flat = [item["Ingredient"] for item in metadata]
loaded_model = load_model()
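
# The expected shape of metadata_faiss.json, inferred from the "Ingredient"
# lookup above (the key name comes from the code; the values are illustrative):
#   [{"Ingredient": "olive oil"}, {"Ingredient": "sea salt"}, ...]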

def main():
    st.title("Ingredient Name Matching App :smiley:")

    st.header("Matches using embeddings (semantic search)")
    st.write("Enter the JSON input:")
    json_input = st.text_area("JSON input", label_visibility="collapsed")
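
    # A hypothetical example of the expected input (the "ingredients" key is
    # what the loop below reads; the other fields are illustrative):
    #   [{"name": "Margherita", "ingredients": ["tomato & basil", "mozzarella"]}]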

    if st.button("Process"):
        start_time = time.time()
        with st.spinner("Processing..."):
            try:
                input_data = json.loads(json_input)

                for menu_item in input_data:
                    ing_list = menu_item.get("ingredients", [])
                    matches, scores = get_top_matches(ingredients_flat, ing_list, loaded_model, index)
                    menu_item["Ingradients_matched"] = matches
                    menu_item["scores"] = scores

                #st.write("Processed JSON:")
                #st.write("<pre>" + json.dumps(input_data, indent=4) + "</pre>", unsafe_allow_html=True)
                output_df = pd.DataFrame(input_data)
                st.write("Processed Data:")
                st.write(output_df)


            except json.JSONDecodeError:
                st.error("Invalid JSON input. Please check and try again.")

        end_time = time.time()
        st.write(f"Processing time: {end_time - start_time:.2f} seconds")

if __name__ == "__main__":
    main()
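
# Launch locally (script name assumed; substitute this file's actual name):
#   streamlit run app.py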